#!/usr/bin/perl

#    grun - lightweight jobs queueing system
#    Copyright (C) 2011 Erik Aronesty
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

use strict;

use Carp qw(carp croak confess cluck);
use Getopt::Long qw(GetOptions);
use Data::UUID;

use ZMQ::LibZMQ3;
use ZMQ::Constants ':all';
use JSON::XS;
use Time::HiRes;
use BSD::Resource;

use IO::File;
use POSIX qw(:sys_wait_h strftime);
use Socket qw(IPPROTO_TCP TCP_NODELAY TCP_KEEPIDLE TCP_KEEPINTVL TCP_KEEPCNT);
use Fcntl qw(F_GETFL F_SETFL O_NONBLOCK);
use Safe;
use Cwd qw(abs_path cwd);
use List::Util qw(min max);
use File::Basename qw(dirname);

sub pretty_encode;

our ($REVISION) = (q$LastChangedRevision: 696 $ =~ /(\d+)/);

our $VERSION = "0.9.$REVISION";				# 0.10 wil be feature lock, after sql impl.  0.9.X is zmq & json::xs

my $STATUS_NEVERRUN=199;
my $STATUS_ORPHAN=-8;
my $STATUS_UNKNOWN=-9;
my $STATUS_EXECERR=-10;
my $PPID=$$;
my $WIN32 = ($^O =~ /Win32/);
my $TIMEFMT = 'command:%C\ncpu-real:%E\ncpu-user:%U\ncpu-sys:%S\nmem-max:%M\nmem-avg:%t\nctx-wait:%w\nfs-in:%I\nfs-out:%O';

my ($daemon, $killjob, $editjob);
my (%conf, %def, @metrics, @labels);

# defaults just run things locally, no master
$def{config} = "/etc/grun.conf";		# config file
$def{spool} = "/var/spool/grun";		# dir to place jobs
$def{port} = 5184;				# listen/connect port
$def{bind} = '0.0.0.0';				# listen addr
$def{env} = ['PATH'];			# list of environment vars to copy from submit through to exec
$def{default_memory} = 1000 * 1000;        # default job memory
$def{default_priority} = 20;        # default job priority (20 = always run)
$def{ping_secs} = 30;				# how often to tell about load/mem/stats
$def{remove_secs} = '$ping_secs * 1000';	# don't even try kickstarting if the node is this old
$def{idle_load} = .3;				# how often to tell about load/mem/stats
$def{retry_secs} = 10;				# how often to retry notifications
$def{bench_secs} = 86400;			# how often to re-benchmark
$def{max_buf} = 1000000;			# how often to retry notifications
$def{expire_secs} = 14400;			# remove jobs whose execution nodes haven't reported back in this amount of time
$def{io_keep} = 3600;				# keep io for this long after a job with i/o is finished in a detached session
#$def{hard_factor} = 1.5;				# hard limit factor
$def{max_sched} = 50;				# how many different jobs to try and match before giving up on the rest (queue busy)
$def{spread_pct} = 5;				# how often to "distribute jobs", versus "clump" them
$def{master} = 'localhost:5184';		# central scheduler
$def{services} = "queue exec";			# all can run
$def{pid_file} = "/var/run/grun.pid";		# pid file
$def{log_file} = "/var/log/grun.log";		# pid file
$def{hostname} = $ENV{HOSTNAME} ? $ENV{HOSTNAME} : $ENV{COMPUTERNAME} ? $ENV{COMPUTERNAME} : `hostname`;
$def{log_types} = "note error warn";	# log all
$def{nfs_sync} = 1;                     # enable nfs sync support

chomp $def{hostname};

sub debugging;

my $GRUN_PATH=abs_path($0);

my ($qinfo, $help, $config, $ver);

Getopt::Long::Configure qw(require_order no_ignore_case passthrough);

my $context = zmq_init();

my @ORIG_ARGV= @ARGV;

GetOptions("daemon"=>\$daemon, "CONF:s"=>\$config, "trace"=>\$def{trace}, "query"=>\$qinfo, "V"=>\$ver, "help"=>\$help) ||
	die usage();

(print "grun $VERSION\n") && exit(0) if $ver;

my @send_files;             # files to send

my $safe = new Safe;

$def{config} = $config if $config;

init();

my $stream_quit = 0;

if ($ARGV[0] eq '-X') {
    do_stream();
    exit(0);
}

if ($ARGV[0] eq '-Y') {
    do_execute();
    exit(0);
}


if ($ARGV[0] eq '-?') {
	shift @ARGV;
	$help = 1;
}

$help = 1 if defined $config && !$config;

if ($help) {
	print usage();
	exit 0;
}

if (!$daemon) {
	# -k <id> works as long as -d wasn't specified
	GetOptions("kill"=>\$killjob, "trace"=>\$def{trace}, "edit|e"=>\$editjob) ||
		die usage();
}

if ($conf{debug_memory}) {
	eval {require Devel::Gladiator;};
	die $@ if $@;
}

my $gjobid = slurp("$conf{spool}/nextid");
my $log_to_stderr = 0;

if ($qinfo) {
# this is the code for grun -q

	Getopt::Long::Configure qw(no_require_order no_ignore_case passthrough);
	my %opt;
	GetOptions(\%opt, "silent", "inplace", "hosts=s", "debug") || die usage();
	my $cmd = shift @ARGV;
	die usage() if !$cmd;
    $log_to_stderr = 1 if $opt{debug};

	my @arg = @ARGV;
	$cmd =~ s/^-//;
	my $tmp = substr bestunique($cmd, qw(config status jobs file history wait memory)), 0, 4;
	if (!$tmp) {
		die "Command $cmd is not available, for help type grun -query -?\n";
	}
	$cmd = $tmp;

# some commands default to localhost, others default to queue host... this is confusing... fix?

	my @dest = $opt{hosts} ? expandnodes($opt{hosts}) : 
		$cmd eq 'conf' ? [$conf{bind}, $conf{port}] : 
		[$conf{master},$conf{master_port}];

	if ($cmd eq 'file' && @dest > 1) {
		die "Command $cmd cannot be run on multiple hosts";
	}

    my $ok=0;
	for my $d (@dest) {
		my ($host, $port) = @$d;

        if ($cmd eq 'wait') {
            my $st = 0;
            for (@arg) {
                my ($res) = waitmsg($host, $port, "jwait", $_);
                if ($res && defined $res->{status}) {
                    print "Job $_ status $res->{status}\n";
                    $st = $res->{status} if $res->{status};
                } else {
                    print "Job $_ status $STATUS_UNKNOWN\n";
                    $st = $STATUS_UNKNOWN;
                }
            }
            exit $st;
        } elsif ($cmd eq 'file') {
			my $cwd = cwd;
			my @need;
			for (@arg) {
				next if -e $_;
				if ($_ !~ /^\//) {
					$_ = "$cwd/$_";	
                }
                push @need, $_;
			}
            die "not supported yet\n";
		#	if (@need) {
		#		my ($res, $error) = waitio({inplace=>$opt{inplace}}, $host, $port, "xcmd", 'file', @need);
		#		die $error, "\n" if $error && !$opt{silent};
		#		exit 1 if $error;
		#	}
		} elsif ((!samehost($host,$conf{hostname}) || (!$ENV{_GRUN} && (!$conf{services}->{queue} || $cmd !~ /^stat|jobs|hist$/)))) {
            # this could get ugly, if called a lot, may want to make more efficient
            warn ("waitmsg($host, $port, 'xcmd', $cmd, @arg, @{[%opt]})\n") if $opt{debug};
			my ($ret) = waitmsg($host, $port, "xcmd", $cmd, @arg, %opt);
			print $ret;
            $ok=1 if $ret;
		} else {
            warn ("Using local queue status, host $host is $conf{bind}/$conf{hostip}, name is $conf{hostname} \n") if $opt{debug};
            my $ret;
			if ($cmd eq 'stat') {
				$ret = shownodes(@arg);
			} elsif ($cmd eq 'jobs') {
			    $ret = showjobs(@arg);
			} elsif ($cmd eq 'hist') {
				$ret = showhist(@arg);
			}	
            print $ret;
            $ok=1 if $ret;
		}
	}
    exit($ok ? 0 : 1);
}

my $gpid;						# daemon pid
if (open(IN, $conf{pid_file})) {
	$gpid = <IN>;
	close IN;
}

if ($killjob) {
# grun -k code
	my $sig = 15;
	my $kforce = 0;
	Getopt::Long::Configure qw(no_require_order no_ignore_case);
	GetOptions("signal|n=i"=>\$sig, "force"=>\$kforce) || die usage();

	my $exit = 0;
	for my $job (@ARGV) {
        my @id;
		if ($job !~ /^\d/) {
            @id=(guid=>"$job");
		} else {
            @id=(jid=>$job);
        }
		my $err = kill_job(@id, sig=>$sig, force=>$kforce);
		if (!defined($err) && $@) {
			warn $@,"\n";
			$exit=-1;
		} else {
			my $ok = ($err =~ /^Job.*(aborted|kill requested)/);
            $err =~ s/\n$//;
			warn "$err\n" if $ok; 
			$err = 'No remote response to jkill' if !$ok && !$err;
			warn "Error: $err\n" if !$ok;
			$exit=-1 if !$ok;
		}
	}
	exit 0;
}

if ($editjob) {
	my %ed;
	while (@ARGV) {
		$_=$ARGV[0];
		for (split /,/, $_) {
			my ($key, $val) = $_ =~ /^([^=]+)(?:=(.*))?$/;
			my $nk = bestunique($key, qw(hold resume memory cpus state hosts), @metrics, @labels);
			$key = $nk if $nk;
			$key = 'state', $val = 'hold' if ($key eq 'hold');
			$key = 'state', $val = 'resume' if ($key eq 'resume');
			if ($key eq 'state') {
				$val = substr bestunique($val, qw(hold resume)), 0, 4;
				die "' must be one of: h(old) r(esume)\n" unless $val;
			}
			$ed{$key}=$val;
		}
		shift;
		last unless $ARGV[0] =~ /=/;
	}
	my @jids = @ARGV;

	die usage() if !%ed || !@jids;

	my $ex = 0;
	for my $jid (@jids) {
		warn "Edit " . packdump(\%ed) . "\n";
		my ($err) = waitmsg($conf{master}, $conf{master_port}, 'jedit', $jid, %ed);
		my $ok = ($err =~ /^Job.*edited/);
		$err=~ s/\n$//;
		warn "$err\n" if $ok;
		$err = 'No remote response to jedit' if !$ok && !$err;
		warn "Error: $err\n" if !$ok;
		$ex = 1 if !$ok;
	}
	exit $ex;
}




my ($router, $read_set, $write_set, $quit, %pid_jobs, %j_wait, %io_wait, %start_wait);	# daemon globals
my %ZMQS;                   # hash of open sockets
my %nodes;                  # hash of registered nodes

if ($daemon) {
	startdaemon();
} else {
    grun_client();
}

####################
# client mode

my $make;
my %sync_after;
my %sync_before;
my %sync_already;

sub grun_client {
    my %jobs;
    my %opt;

    $opt{wait} = 1;				# keep socket open until job is finished
    $opt{io} = 1;				# copy io back on the socket, implies wait

    Getopt::Long::Configure qw(require_order no_ignore_case passthrough);

    GetOptions(\%opt, "file=s", "int|I", "memory|m=i", "hosts|h=s", "cpus|c=f", "io!", "wait!", "err_a|err-append|ea=s", "err|e=s", "out|o=s", "out_a|out-append|oa=s", "ouer|out-err|oe=s", "ouer_a|out-err-append|oea=s", "jobx|jobid|j=s", "verbose", "make|M", "debug|D", "env|E=s", "alert=s", "param|p=s@", "wait-exists|W", "priority|r=i");

    if ((!!$opt{out} + !!$opt{out_a} + !!$opt{ouer} + !!$opt{ouer_a})>1) {
        $config=undef;
        die "ERROR: Specify only one of --out, --out-append, --out-err or --out-err-append\n\n" . usage()
    }

    if ((!!$opt{err} + !!$opt{err_a} + !!$opt{ouer} + !!$opt{ouer_a})>1) {
        $config=undef;
        die "ERROR: Specify only one of --err, --err-append, --out-err or --out-err-append\n\n" . usage()
    }

    if (my $t=$opt{out_a}?$opt{out_a}:$opt{ouer_a}) {
        $opt{out}=$t;
        $opt{out_a}=1;
        delete $opt{ouer_a};
    }
    if (my $t=$opt{err_a}?$opt{err_a}:$opt{ouer_a}) {
        $opt{err}=$t;
        $opt{err_a}=1;
        delete $opt{ouer_a};
    }
    if ($opt{ouer}) {
        $opt{out}=$opt{err}=$opt{ouer}; delete $opt{ouer};
    }

    my $verbose = $opt{verbose}; delete $opt{verbose};
    my $env = $opt{env}; delete $opt{env};
    $make = $opt{make}; delete $opt{make};
    $log_to_stderr = 1 if $opt{debug};

    if ($< == 0) {
        die "Won't run a job as root\n";
    }

    if ($opt{err} && $opt{out}) {
        $opt{io} = 0;
    }

    while ($ARGV[0] =~ /^--([\w-]+)=([\w=]+)/) {
        # allow arbitrary job options, that jan later be referred to in match expressions
        # or in execution wrappers, etc
        $opt{$1} = $2;
        shift @ARGV;
    }

    if ($make || $verbose) {
        GetOptions(\%opt, "noexec");
    }

    my @cmd = @ARGV;

    if ($opt{file}) {
        # read options from a file
        funpack($opt{file}, \%opt);
        if ($opt{cmd}) {
            if (@ARGV) {
                die "Can't supply cmd: in the file and '@cmd' on the command line\n";
            }
            if ($opt{cmd} !~ /^[\w0-9:\/\t -]+$/) {
                # not simple: let bash handle it
                @cmd = ('bash', '-c', $opt{cmd});
            } else {
                # simple: split, pass as is to exec
                $opt{cmd} =~ s/^\s+//;
                $opt{cmd} =~ s/\s+$//;
                @cmd = split /\s+/, $opt{cmd};
            }
        }
    }

    # force exec in "same as current dir"
    $opt{cwd} = cwd;
    if (!$opt{cwd}) {
        die "Can't get current working directory, not executing unanchored remote command.\n";
    }

    if ($make) {
        # %i:input %o:output
        my (@i, @o);
        for (@cmd) {
            my @t = m/[#%]([io]):(\S+)/g;
            if (@t) {
                for (my $i=0;$i<@t;++$i) {
                    push @o, $t[$i+1] if $t[$i] eq 'o';
                    push @i, $t[$i+1] if $t[$i] eq 'i';
                }
                s/%[io]:(\S+)/$1/g;
                s/#[io]:\S+//g;
            }
            my @t = m/([<>])\s*(\S+)/g;
            if (@t) {
                for (my $i=0;$i<@t;$i+=2) {
                    push @i, $t[$i+1] if $t[$i] eq '<' && $t[$i+1]=~/^\s*\w/;
                    push @o, $t[$i+1] if $t[$i] eq '>' && $t[$i+1]=~/^\s*\w/;
                }
            }
            s/\%\!\>(>?\s*\S+)/>$1/g;
        }
        die "Unable to determine i/o for -M (make) semantics\n\n" . usage()
            if !@i || !@o;

        my $need=0;
        for my $i (@i) {
            syncfile($i);
            add_syncfile_before($i);
            for my $o (@o) {
                syncfile($o);
                if (! (-s $o) || (fmodtime($i) > fmodtime($o))) {
                    warn "# need $o\n" if $opt{noexec} || $verbose;
                    $need=1;
                }
            }
        }
        if (!$need) {
            warn "Skipping: @cmd\n";
            exit 0;
        }
        for (@o) {
            add_syncfile_after($_);
        }
        warn "+@cmd\n" if $opt{noexec} || $verbose;
    } else {
        for (@cmd) {
            if (m{([^\s,:]+)/([^\s,:]+)}) {
                add_syncfile_after($_);
                add_syncfile_before($_);
            }
        }
        add_syncdir_after($opt{cwd});
        add_syncdir_before($opt{cwd});
    }

    if ($ARGV[0] =~ /^-/) {
        die "Unknown option $ARGV[0]\n";
    }


    #die pretty_encode \@cmd  if $opt{debug};

    if (!@cmd) {
        die usage();
    }

    if ($cmd[$#cmd] =~ /\&$/) {
        # TODO: grun should wait for all kids, and disallow detaching, not just get rude here
        die "Not running background-only job.  You might mean: grun \"command\" &.\n";
    }

    if ($conf{auto_profile}) {
        if (-e ($conf{auto_profile})) {
            my $cmd = join ' ', @cmd;
            # safe'ish eval, just so there aren't weird side effects
            my ($cpu, $mem, $prof) = evalctx(slurp($conf{auto_profile}) . ";\nreturn (\$cpu, \$mem, \%prof);", cmd=>$cmd, cmd=>\@cmd);

            $prof = $cpu if ref($cpu) eq 'HASH';

            # alias names
            $prof->{cpus}=$prof->{cpu} if !$prof->{cpus} && $prof->{cpu};
            $prof->{memory}=$prof->{mem} if !$prof->{memory} && $prof->{mem};

            if ($prof && ref($prof) eq 'HASH') {
                $prof->{memory} = $mem if defined($mem) && !ref($mem) && !$prof->{memory};
                $prof->{cpus} = $cpu if defined($cpu) && !ref($cpu) && !$prof->{cpus};

                $opt{memory}=$prof->{memory} if $prof->{memory} && !$opt{memory};
                $opt{cpus}=$prof->{cpus} if $prof->{cpus} && !$opt{cpus};
                $opt{hosts}=$prof->{hosts} if $prof->{hosts} && !$opt{hosts};
                $opt{priority}=$prof->{priority} if $prof->{priority} && !$opt{priority};

                for ((@metrics,@labels)) {
                    # as if the user entered it
                    next if !$_;
                    push @{$opt{"param"}}, "$_=" . $prof->{$_} if defined($prof->{$_});
                }
            } else {
                $opt{memory}=$mem if ($mem > $opt{memory});
                $opt{cpus}=$cpu if ($cpu > $opt{cpus});
            }

            if ($@) {
                die "Can't run $conf{auto_profile}: $@\n";
            }
        } else {
            die "Can't find $conf{auto_profile}: $!\n";
        }
    }

    my %param;
    if ($opt{param}) {
        for (@{$opt{param}}) {
            if (/^([^=]+)=(.*)/) {
                $param{$1} = $2; 
            } else {
                die "Parameter $_: should be name=value\n";
            }
        }
    }
    $opt{param} = \%param;

    $opt{priority} = $ENV{GRUN_PRIORITY}+0 if !$opt{priority} && $ENV{GRUN_PRIORITY} >= 1;
    $opt{priority} = $conf{default_priority} if !$opt{priority};

    # convert memory to kB
    if ($opt{memory}) {
        if ($opt{memory} =~ /kb?$/i) {
            # internal memory unit is kB
        } elsif ($opt{memory} =~ /gb?$/i) {
            # convert gB to kB
            $opt{memory} *= 1000000;
        } else {
            # convert mB to kB
            $opt{memory} *= 1000;
        }
    } else {
        $opt{memory} = $conf{default_memory};
    }

    # no socket io unless waiting
    if (!$opt{wait}) {
        delete $opt{io};
    }


    # copy env
    if ($env eq '*' || ($conf{env} && ($conf{env}->[0] eq '*')) ) {
        for (keys %ENV) {
            if (! /^_|LS_COLORS/) {
                $opt{env}->{$_} = $ENV{$_};
            }
        }
    } else {
        for (@{$conf{env}}) {
            $opt{env}->{$_} = $ENV{$_} if defined $ENV{$_};
        }
        for (split /\s+/, $env) {
            $opt{env}->{$_} = $ENV{$_} if defined $ENV{$_};
        }
    }

    $opt{user} = getpwuid($>);
    $opt{group}=$);
    $opt{umask} = umask();
    $opt{env}->{USER} = $opt{user};

    if (!$opt{wait}) {
        open STDERR, ">&STDOUT";
    }

    $opt{memory} = $opt{memory} ? $opt{memory} : $conf{default_memory};
    $opt{cpus} = $opt{cpus} ? $opt{cpus} : 1;

    for (@metrics) {
        if ($conf{"default_$_"}) {
            $opt{$_} = $opt{$_} ? $opt{$_} : $conf{"default_$_"};
        }
    }

    if ($verbose) {
        printf STDERR "Memory:	%d\n", $opt{memory};
        printf STDERR "CPUs:	%d\n", $opt{cpus};
        printf STDERR "Hosts:	%s\n", $opt{hosts} if $opt{hosts};
        for ((@metrics,@labels)) {
            printf STDERR proper($_) . ":	%s\n", $opt{param}->{$_} ? $opt{param}->{$_} : 1;
        }
    }

    if ($opt{jobx}) {
        if ($opt{jobx} =~ /^\d/) {
            die "External job id's should start with a non-numeric\n";
        }
    }

    my %info;

    sub client_sigh {
        my $signame = shift;
        $SIG{INT} = undef;
        $SIG{TERM} = undef;
        $SIG{PIPE} = undef;
        if ($info{jid}||$info{guid}) {
            if (!($signame eq 'PIPE')) {
                print STDERR "Aborting command, sending jkill for $info{jid}\n";
            }
            my $code = $signame eq 'INT' ? 2 : $signame eq 'PIPE' ? 13 : 15;
            my $err = kill_job(jid=>$info{jid}, guid=>$opt{guid}, sig=>$code, termio=>1);
            if (!($signame eq 'PIPE')) {
                if (!defined($err) && $@) {
                    warn $@,"\n";
                }
            }
            exit 128+$code;
        } else {
            die "Interrupted before job sent\n";
        }
    };

    # for testing -M make
    exit 0 if $opt{noexec};

    $SIG{INT} = \&client_sigh;
    $SIG{TERM} = \&client_sigh;
    $SIG{PIPE} = \&client_sigh;

    $opt{cmd} = \@cmd;
    $opt{hard_factor} = $conf{hard_factor};
    $opt{frompid} = $$;
    $opt{guid} = $opt{jobx} ? $opt{jobx} : create_guid();
    $opt{syncdirs} = [keys(%sync_before)];
    %info = waitmsg($conf{master}, $conf{master_port}, 'run', \%opt);

    if (!%info) {
        die ($@ ? $@ : "No response to 'run'") . "\n";
    }

    if ($info{error}) {
        print STDERR $info{error}, "\n";
        exit -1;
    }

    if (!$info{jid}) {
        print STDERR "Failed to submit job", "\n";
        exit -1;
    }
    my $save_jid = $info{jid};

    $0 = "GRUN:$save_jid";

    if ($verbose) {
        printf STDERR "Job_ID:	$info{jid}\n";
    }

    if ($info{already}) {
        if ($opt{"wait-exists"}) {
            my ($res) = waitmsg($conf{master},$conf{master_port}, "jwait", $info{jid});
            my $st;
            if ($res && defined $res->{status}) {
                print "Job $_ status $res->{status}\n";
                $st = $res->{status} if $res->{status};
            } else {
                print "Job $_ status $STATUS_UNKNOWN\n";
                $st = $STATUS_UNKNOWN;
            }
            exit $st;
        } else {
            print STDOUT "Job_ID: $info{jid} \n" if !$verbose;
            printf STDERR "Already running job named $opt{jobx}, try grun -q wait JOB\n";
            exit -1; 
        }
    }

    if ($opt{wait}) {
        # wait for a job ip
        while (!defined($info{ip})) {
            my %tmp = waitmsg_retry($conf{retry_secs}*1000, $conf{master}, $conf{master_port}, 'jinfo', $info{jid});

            if ($tmp{error}) {
                print STDERR $tmp{error}, "\n";
                exit $tmp{status} != 0 ? $tmp{status} : -1;
            }

            if (!defined($tmp{ip})) {
                xlog("error","Had to retry after a retry... BUG in job $save_jid\n");
                sleep(5);
            } else {
                %info=%tmp;
            }
        }

        my $host = $info{hostname} ? $info{hostname} : $info{ip};           # support old ver which didn't have hostname

        if ($verbose) {
            printf STDERR "Host:	%s\n", $host;
        }

        # look at all the work you can avoid if you don't wait
        my ($stat, $err, $diderr);
        # connect to executing node directly and ask for stderr, stdout, and status based on job id
        if (defined $info{status}) {
             $stat=$info{status};
             $err=$info{error};
        }
        while (!defined $stat && !defined $err) {
            # shadow watcher....
            if ($info{ip}=~/^\d/) {
                if ($opt{io} && !($opt{err} && $opt{out}) ) {
                    ($stat, $err, $diderr) = waitio($info{ip}, $info{port}, $info{jid}, \%opt);
                } else {
                    my ($key, $dat) = waitmsg_retry($conf{retry_secs}*1000, $info{ip}, $info{port}, 'xstat', $info{jid});
                    $stat=$dat->{status};
                    $err=$dat->{error};
                }
                if ($stat == 65280 && !$err) {
                    print STDERR "Error: [$host] Command returned -1\n";
                } 
            } else {
                $stat=$STATUS_UNKNOWN;
                $err="Error in grun protocol (ip=$info{ip}), unknown status!\n";
            }
            sleep 5 if (!defined $stat && !defined $err);
        }

        # send this command into the ether...if exec doesn't get it, clean up after timeout
        if ($opt{io}||$opt{wait}) {
            if ($info{ip}=~/^\d/) {
                sendcmd($info{ip}, $info{port}, 'xclean', $info{jid});
            }
        }
        if ($stat == 11) {
            $err = 'Segmentation fault';
        }
        if ($stat > 127) {
            # shift... but if that doesn't get you anything, set to -1 (unknown error)
            $stat = $stat>>8;
            $stat = -1 if !$stat;
        }
        syncafter();
        if ($err) {
            print STDERR "[$host] $err", "\n";
            $stat = -1 if !$stat;				# unknown error if undefined
        } else {
            if ($stat != 0 && ! $diderr) {
                print STDERR "Error: [$host] Command returned $stat\n";
            }
        }
        unlink("$ENV{HOME}/.grun/jobx/$opt{jobx}") if $opt{jobx};

        exit($stat);
    } else {
        # aah... nicer
        print STDOUT "Job_ID: $info{jid} \n" if !$verbose;
    }
}

### nfs sync support

sub syncdirs {
    for (@_) {
        syncdir($_);
    }
}

sub syncafter {
    %sync_already = ();
    return unless $conf{nfs_sync};
    syncdirs(keys(%sync_after));
}

sub add_syncdir_after {
    my ($d) = @_;
    return unless $conf{nfs_sync};
    $sync_after{abs_path($d)}=1;
}

sub add_syncdir_before {
    my ($d) = @_;
    return unless $conf{nfs_sync};
    $sync_before{abs_path($d)}=1;
}

sub add_syncfile_before {
    my ($f) = @_;
    return unless $conf{nfs_sync};
    add_syncdir_before(-d $f ? $f : dirname($f));
}

sub add_syncfile_after {
    my ($f) = @_;
    return unless $conf{nfs_sync};
    add_syncdir_after(-d $f ? $f : dirname($f));
}

sub syncfile {
    my ($f) = @_;
    return unless $conf{nfs_sync};
    syncdir(-d $f ? $f : dirname($f));
}

# this refreshes the lookupcasche, which is an issue when running scripts back to back on multiple nodes using NFS
sub syncdir {
    my ($d) = @_;
    my $tmp;
    return if $sync_already{$d};
    opendir($tmp,$d);
    closedir($tmp);
}

sub readsocks {
    my $did;
    # identity & data received
    my $cnt;

    if (!$router) {
        xlog("debug", "Readsocks called with no router: " . Carp::longmess() );
        return;
    }

    while (my $id= zmq_recvmsg($router,ZMQ_NOBLOCK)) {
        ++$cnt;
        $did=1;
        $id = zmq_msg_data($id);

        if (zmq_getsockopt($router,ZMQ_RCVMORE)) {
#            print "getting sep\n";
            my $sep  = zmq_recvmsg($router);
        }
        if (zmq_getsockopt($router,ZMQ_RCVMORE)) {
#            print "getting data\n";
            my $msg  = zmq_recvmsg($router);
            my $data = zmq_msg_data($msg);
            my @resp = process_message($data, $id);
            if (@resp) {
                print("Got resp: @resp\n");
                replymsg($id, @resp);
            } else {
#                warn("No resp for $data\n");
            }
        }
        while (zmq_getsockopt($router,ZMQ_RCVMORE)) {
                print("Discarding excess multipart data!\n");
        }
        if ($cnt > 100000) {
            xlog("error", "Getting flooded with messages");
            last;
        }
    }
    return $did;
}

sub jhiststat {
    my ($jid) = @_;
    my $jhistfile=jhistpath($jid);
    if (-e $jhistfile) {
        # job is toast... but maybe some streams are waiting
        my $job=unpack_file($jhistfile);
        delete $job->{env};
        $job->{host}="n/a" if !$job->{host} && !defined $job->{status};
        $job->{ip}=host2ip($job->{host}) if !$job->{ip} && ! defined $job->{status};
        # needs ip!
        $job->{ip}="n/a" if defined $job->{status} && ! $job->{ip};
        $job->{hostname}=$job->{host};
        return $job;
    } 
    return undef;
}

sub replymsg {
    my ($id, @resp) = @_;
    if (debugging) {
        my $hid=unpack("h*",$id);
        xlog("debug", "Reply ($hid) " . packdump(\@resp) . "\n") if $conf{trace};
    }
    zmq_send($router, $id, length($id), ZMQ_SNDMORE);
    zmq_send($router, "", 0, ZMQ_SNDMORE);
    zmq_send($router, packref(\@resp));
}

sub selfcmd {
	my $cmd = packcmd(@_);
	process_message($cmd, undef);
}

my $debugzid;
sub process_message {
	my ($src_data, $zid) = @_;

	my ($ip, $trace, $cmd, @args) = unpackcmd($src_data);

    $trace=$conf{trace} if !$trace;

    if (debugging|$trace) {
        my $hid=defined($zid) ? unpack("h*",$zid) : "";
	    xlog($trace ? "trace" : "debug", "Received command ($hid) '$cmd' : $src_data\n") if $trace;
    }

	return ('error'=>$@) if ($@);

	if ($cmd eq 'xcmd') {
# these commands 'query or interfere' with normal running of the server 
# they are initiated by a user	
# they are limped together like this for basically no reason

		if ($args[0] eq 'relo') {
# reread config... maybe rebind stuff too
			xlog("note", "Reload from remote command (ARGS: @ORIG_ARGV)");
	        eval{init();};
            if ($@) {
			    return "Error: $conf{config}, $@";
            } else {
    			return "Ok, reloaded from $conf{config}";
            }
		} elsif ($args[0] eq 'term') {
			xlog("note", "Shutdown from remote command");
			$quit = 1;
			return 'Ok, shutdown initiated';
		} elsif ($args[0] eq 'rest') {
			xlog("note", "Restarting from remote command  ($GRUN_PATH @ORIG_ARGV)");
			$quit = 1;
            zmq_unbind($router, "tcp://$conf{bind}:$conf{port}");
			if (!fork) {
                zmq_fork_undef();
				exec($GRUN_PATH, @ORIG_ARGV);
			}
			return "Ok, restart initiated";
		} elsif ($args[0] eq 'stat') {
			shift @args;
			return shownodes(@args);
		} elsif ($args[0] eq 'hist') {
			shift @args;
#            if (@args && (@args > 1 || $args[0] !~ /^\d+$/)) {
                # fork... do it in parallel
#                forkandgo($zid, \&showhist, @args);
#                return();
#            } else {
                # inline... do it now, less expensive than fork!
    			return showhist(@args);
#            }
		} elsif ($args[0] eq 'conf') {
			return showconf();
		} elsif ($args[0] eq 'memo') {
			return showmem();
		} elsif ($args[0] eq 'jobs') {
			shift @args;
			return showjobs(@args);
#        } elsif ($args[0] eq 'file') {
#            shift @args;
#            xlog("note", "Sending file [@args] to remote");
#            return ($SOCK_FILE, @args);
		} else {
			return "Error: unknown xcmd '$args[0]'";	
		}
    } elsif ($cmd eq 'frep') {
#		warn("router is $router, zid is $debugzid\n");
        my ($dat) = @args;
        if ($dat->{zid}) {
            xlog("debug", "Sending response to $dat->{zid}");
		    replymsg(pack("h*",$dat->{zid}),$dat->{out},$dat->{more});
		    #replymsg($debugzid,$dat->{out});
        }
	} elsif ($cmd eq 'node') {
# this is the 'node ping'
		if (ref($args[0]) eq 'HASH') {		# bit of validation
            my $node = $args[0];

            $node->{ip}=$ip unless $node->{ip};
            $ip=$node->{ip};

            if ($ip) {
                my $file = "$conf{spool}/nodes/$ip.reg";
                open(F, ">$file") || return("Error: can't create $file : $!");
                print F packfile($node);
                close F;
    # also stick in memory

                if (!$nodes{$ip}) {
                    xlog("note", "Registering node $ip:$node->{port} $node->{hostname}");
                }

                $node->{ping} = time();
                $node->{ex_ping} = $nodes{$ip}->{ping};
                $node->{zid} = $zid;

                $nodes{$ip} = $node;
            } else {
                    xlog("note", "Can't register node with no ip");
            }
# save execution node ping time for diagnostic (times out of sync or very slow transmission, etc)
		} else {
			return "Error: invalid node registration info";	
		}
		return ();
	} elsif ($cmd eq 'xedit') {
		my ($jid, $ed) = @args;
        if ($ed->{state} =~ /hold|resu/) {
            my $job = unpack_file("$conf{spool}/jpids/$jid");

            if ($job && $job->{pid}) {
                my ($sig, $stat);
                if ($ed->{state} eq 'hold') {
                    $sig=-19; $stat="susp";
                } else {
                    $sig=-18; $stat="";
                }
                kill($sig, $job->{pid});
                sendcmd($conf{master},$conf{master_port}, 'jedit', $job->{id}, state=>$stat);
            }
        }
	} elsif ($cmd eq 'jedit') {
		my ($jid, %ed) = @args;
        my $fname;
		if (!$jid || !%ed) {
			xlog("error", "Invalid edit request (@args)");
			return "Invalid edit request (@args)\n";
		} elsif (! -e "$conf{spool}/jobs/$jid") {
			if (! -e "$conf{spool}/jobs/$jid.ip") {
				xlog("error", "Job $jid not found during jedit");
				return "Job $jid not found during jedit";
			} else {
				my $job_ip = slurp("$conf{spool}/jobs/$jid.ip");
                # send an 'xedit' to the running node
                # needed only for certain edits....think about this!
                $fname = "$conf{spool}/jobs/$jid:$job_ip.run";
                if ($ed{state} =~ /hold|resu/) {
                    if ($nodes{$job_ip}) {
                        sendcmd_nowait($job_ip, $nodes{$job_ip}->{port}, 'xedit', $jid, \%ed);
                    }
                }
                # for now we allow edit-in-place.... for no effect in some cases, but not others
			}
		} else {
            $fname = "$conf{spool}/jobs/$jid";
        }
       
        # assert($fname) 
        my $ref = unpack_file($fname);
        for my $key (keys(%ed)) {
            $ref->{$key} = $ed{$key};
        }
        burp("$conf{spool}/jobs/$jid.jedit", packfile($ref));
        rename("$conf{spool}/jobs/$jid.jedit", $fname);
        return "Job $jid edited\n";
    } elsif ($cmd eq 'jwait') {
        my ($job) = @args;
 
        # user can specify guid or jid
        my $jid = jid_from_opts($job=~/^\d/?{jid=>$job}:{guid=>$job});
        if (!$jid) {
            return {error=>"Job $job not found"};
        }
        if ( -e "$conf{spool}/jobs/$jid" || -e "$conf{spool}/jobs/$jid.ip" ) {
            $j_wait{$jid}->{$zid}=time();
            return ();
        } else {
            my $jhistfile=jhistpath($jid);
            if (-e $jhistfile) {
                # repeated acks are ok
                my $job=unpack_file($jhistfile);
                if ($job) {
                    return $job;
                } else {
                    return {error=>"Invalid job history $jhistfile"};
                }
            } else {
               return {error=>"Job not running, and has no history"};
            }
        }
	} elsif ($cmd eq 'jkill') {
# this is the 'job kill command'
		my ($job) = @args;

        # user can specify guid or jid
        my $jid = jid_from_opts($job);
        if (!$jid) {
            if ($job->{guid}) {
                return "Job $job->{guid} not found\n";
            } else {
                return "No job specified for jkill\n";
            }
        }
		if (! -e "$conf{spool}/jobs/$jid") {
			if (! -e "$conf{spool}/jobs/$jid.ip") {
				xlog("error", "Job $jid not running or queued, but kill requested");
				return "Job $jid not running or queued";
			} else {
# send xabort to correct node
				my $job_ip = slurp("$conf{spool}/jobs/$jid.ip");
				if (!$job_ip) {
					xlog("error", "Job $jid empty ip!");
                    my $job = unpack_file("$conf{spool}/jobs/$jid");
                    $job->{error}="Unknown state";
                    archive_job($jid, $job, $STATUS_UNKNOWN, undef, undef);
			        return "Job $jid, unknown state";
				} else {
					if ($job->{force}) {
						xlog("error", "Job $jid forced kill");
                        my $job = unpack_file("$conf{spool}/jobs/$jid:$job_ip.run");
						if ($nodes{$job_ip}) {
                            sendcmd_nowait($job_ip, $nodes{$job_ip}->{port}, 'xabort', $jid, 9);
                        }
                        $job->{error}="Forced kill";
                        archive_job($jid, $job, $STATUS_UNKNOWN, undef, undef);
						return "Job $jid forced kill\n";
					} else {
						if ($nodes{$job_ip}) {
							return "Forward $jid $job_ip:" . $nodes{$job_ip}->{port};
						} else {
							return "Job $jid, node $job_ip not online, can't kill\n";
						}
					}
				}
			}
		} else {
            my $job = unpack_file("$conf{spool}/jobs/$jid");
            $job->{error}="Killed";
            archive_job($jid, $job, $STATUS_NEVERRUN, undef, undef);
			return "Job $jid aborted";
		}
	} elsif ($cmd eq 'jstat') {
# sent by the execution node to say what the status of a job is
		my %opts = %{$args[0]};
		if (my $jid = $opts{id}) {
            my $should = slurp("$conf{spool}/jobs/$opts{id}.ip");

            if (! -e "$conf{spool}/jobs/$opts{id}:$ip.run") {
                if (!$should) {
                    # this could be a repeat, and that's ok
                    xlog("debug", "Probably already got 'jstat' for $jid");
                } else {
                    if ($opts{pid}<0) {
                        xlog("error", "Orphaned job report $jid, from $ip for $should, status: $opts{status}");
                        $ip=$should if $opts{status} == $STATUS_ORPHAN;
                    } else {
                        if (!($ip eq $should)) {
                            xlog("error", "Got a report ($opts{status},$opts{jrun}) for job $jid from $ip, should be $should");
                        } else {
                            xlog("error", "Got a report ($opts{status},$opts{jrun}) for job $jid, but there's no $jid:$ip.run file");
                        }
                    }
                }
            }
			if ($opts{jrun}) {
# just a ping
				xlog("trace", "Still alive $jid from $ip") if $trace;
				touch("$conf{spool}/jobs/$jid:$ip.run")
					if -e "$conf{spool}/jobs/$jid:$ip.run";
                if ( ! -e "$conf{spool}/jobs/$jid.ok" ) {
                    # jexok didn't come through, but should have
	                xlog("error", "Writing $jid.ok, from jrun signal, may need to restart exec node");
                    burp("$conf{spool}/jobs/$jid.ok","jrun");
                }
    			return();                           # no dacks for jrun
			} elsif (defined $opts{status}) {
# job is done
               if ( -e "$conf{spool}/jobs/$jid:$ip.run" ) {
                   my $job = unpack_file("$conf{spool}/jobs/$jid:$ip.run");
                   if ($job) {
                        if (my $n=$nodes{$ip}) {
				            xlog("debug", "Clearing cpu: $job->{cpus}, memory: $job->{memory} from $ip");
                            $n->{a_cpus} -= $job->{cpus};                   # deallocate
                            $n->{a_memory} -= $job->{memory};               # deallocate
                            for (@metrics) {
                                $n->{"a_$_"} -= $job->{param}->{$_};
                            }
                            # TODO: probably would be nice, in a jstat, to include metrics so all these aren't guessing

                            $n->{avail} += $job->{cpus};                    # return to avail, until %node is updated
                            $n->{mem} += $job->{memory};                    # return mem
                            $n->{avail} = max($n->{avail},min($n->{cpus},$n->{orig_cpus}-$n->{load}));          # better guess
                        } else {
				            xlog("error", "Got a report for $jid, from $ip, but there's no node like that");
                        }
                        archive_job($jid, $job, $opts{status}, $ip, \%opts);
                   } else {
				        xlog("debug", "Bad job file $conf{spool}/jobs/$jid:$ip.run");
                   }
                } else {
                    my $jhistfile=jhistpath($jid);
                    if (-e $jhistfile) {
                        # repeated acks are ok 
				        xlog("debug", "Got a duplicate report from $conf{master} for job $jid ($jhistfile)");
                    } else {
				        xlog("error", "Got a report for an unknown job $jid, from $ip, status: $opts{status}");
                    }
                }
    			sendcmd_nowait($ip, $opts{replyport}?$opts{replyport}:$conf{port}, 'dack', jid=>$jid);
			} else {
				xlog("error", "Got a report for a job $jid with no status info ($src_data)");
			}
# return ack even if not exists
		}
	} elsif ($cmd eq 'dack') {
# ackknowledge receipt of status signal, so you don't have to do it again
		my %dack = @args;
		xlog("debug", "Got dack for $dack{jid}") if $trace;
		if ($dack{jid}) {
			if (!$dack{jrun}) {
                if (!$io_wait{$dack{jid}} || !$io_wait{$dack{jid}}->{streams}) {
                    exec_clean($dack{jid});
               } else {
		            xlog("debug", "Still waiting on i/o: " . packdump($io_wait{$dack{jid}}) . ", will clean later") if $trace;
                    # ready to clean up
                    burp("$conf{spool}/jstat/$dack{jid}.dack",1);
                }
			}
		}
	} elsif ($cmd eq 'jexok') {
        my ($jex) = @args;
        my $jid = $jex->{id};
	    xlog("debug", "Writing $jid.ok") if $trace;
        touch("$conf{spool}/jobs/$jid:$ip.run");
        burp("$conf{spool}/jobs/$jid.ok",packfile($jex));
	} elsif ($cmd eq 'xexec') {
		my ($opts) = @args;
        execute_job($opts);
	} elsif ($cmd eq 'xclean') {
# cleanup iowait stuff.. i got everything
		my ($jid) = @args;
        xlog("debug", "Client is finished with $jid\n");
        delete $io_wait{$jid};
        delete $start_wait{$jid};
        if (-e "$conf{spool}/jstat/$jid.dack") {
            exec_clean($jid);
        }
	} elsif ($cmd eq 'xabort') {
# kill job
		my ($jid, $sig, $termio) = @args;

        if ($io_wait{$jid} && $io_wait{$jid}->{streams}) {
            xlog("debug", "Alerting streams that $jid is dead\n");;
            for(values %{$io_wait{$jid}->{streams}}) {
                replymsg($_, "quit");
            }
            delete $io_wait{$jid}->{streams};
            touch("$conf{spool}/jstat/$jid.dumped");
        }

		my $job = unpack_file("$conf{spool}/jpids/$jid");
        xlog("debug", "Found job $jid with pid $job->{pid}\n");;
		if ((my $pid = $job->{pid}) > 1) {
            $sig = 2 if $sig !~ /^\d+$/;                # force sig to 2 if not a number
# kill the whole shebang
			my $ok = kill(-$sig, $pid);
			xlog("note", "Kill ($sig) job $jid (pgrp:$pid), return: $ok");

			if ($ok) {
                # report status as "killed"
                selfcmd("sstat", {id=>$jid,status=>127+$sig,error=>"Job $jid aborted",dumped=>1});
				return "Job $jid aborted"; 
			} else {
				return "Job $jid kill $sig failed : $!"; 
			}
            if ($io_wait{$jid} && $io_wait{$jid}->{zid}) {
                # tell waiters it's dead, and streams are dumped
                replymsg($io_wait{$jid}->{zid},"$jid:stat",{status=>127+$sig, error=>"Job $jid aborted", dumped=>1});
            }
		} else {
			return "Job $jid not found for xabort";
		}
	} elsif ($cmd eq 'xio') {
        my ($jid) = @args;
        my $ready = 0;
        my $known = 0;
        if ($io_wait{$jid} && $io_wait{$jid}->{streams}) {
            # definitely not dumped
            $ready = 1;
            for(values %{$io_wait{$jid}->{streams}}) {
                replymsg($_, "ready");
            }
            delete $io_wait{$jid}->{streams};
            $known = 1;
        }

        # set wait flag, if i/o wasn't dumped
        my $stat=getjobstathash($jid);

        # streams are ready, or job isn't done, or io wasn't dumped
        if ($ready||!$stat||!$stat->{dumped}) {
            my $end_time;
            # job is done
            if ($stat && $stat->{rtime}) {
                $end_time = (stat("$conf{spool}/jstat/$jid.stat"))[9]+$stat->{rtime};
            }
            # finished a while ago
            if ($stat && ($end_time < (time()-(5*$conf{ping_secs}))) ) {
                xlog("error", "Dumping i/o for completed job $jid because streamer hasn't responded in $def{ping_secs} secs, end-time: $end_time");
                touch("$conf{spool}/jstat/$jid.dumped");
                $stat->{dumped}=1;
            } else {
                xlog("debug", "Creating io_wait hash entry for $jid, (E:$end_time, $stat)") unless $io_wait{$jid}->{zid} eq $zid;
                $io_wait{$jid}->{type} = 'io';
                $io_wait{$jid}->{zid} = $zid;
                $io_wait{$jid}->{time} = time();        # toss this entry if it gets oldi
                $known = 1;
            }
        }

        if ($stat) {
            if ($stat->{dumped} && $ready ) {
                xlog("error", "Dumped stream $jid before ready received");
            }
            xlog("debug", "Returning $jid:stat for $jid");
            return("$jid:stat", $stat);
        }
        if (!$known) {
            # unknown... no iowait, no stat
            xlog("debug", "Returning $jid:unk for $jid, no io_wait set");
            return("$jid:unk");
        } else {
            # iowait ready
            return()
        }
	} elsif ($cmd eq 'xstat') {
		my ($jid) = @args;

        if ($io_wait{$jid} && $io_wait{$jid}->{streams}) {
            for(values %{$io_wait{$jid}->{streams}}) {
                replymsg($_, "quit");
            }
            delete $io_wait{$jid}->{streams};
        }

        my $stat=getjobstathash($jid);
        return("$jid:stat", $stat) if $stat;

        # only set wait flag if not found
        xlog("debug", "Creating io_wait hash 'xstat' entry for $jid");

        $io_wait{$jid}->{type} = 'stat';
        $io_wait{$jid}->{zid} = $zid;
        $io_wait{$jid}->{time} = time();		# toss this entry if it gets old

		return ();				# wait for response
	} elsif ($cmd eq 'jinfo') {				# tell me what host a job is on
		my ($jid) = @args;
		if (!$jid) {
            xlog("error", "Job '$jid' does not exist from $ip");
            return(error=>"Job '$jid' does not exist from $ip");
        }
		if (! -e "$conf{spool}/jobs/$jid") {
			if (! -e "$conf{spool}/jobs/$jid.ip") {
                my $jhist=jhiststat($jid);
			    if (! $jhist) {
				    xlog("error", "Job '$jid' does not exist from $ip") if $jid;
    				return (error=>"Job '$jid' does not exist during jinfo.");
                } else {
                    return(%$jhist);
                }
			} else {
				my $job_ip = slurp("$conf{spool}/jobs/$jid.ip");
				if (!$job_ip) {
					xlog("error", "No ip for job $jid");
                    return ();
				} else {
# route to correct node
                    if (!$nodes{$job_ip}) {
                        my $jhist=jhiststat($jid);
                        if (!$jhist) {
                            xlog("error", "Job $jid is linked to $job_ip which is not responding");
                            return (warn=>"Job $jid is linked to $job_ip which is not responding");
                        } else {
                            return(%$jhist);
                        }
                    } else {
                        return (jid=>$jid, ip=>$job_ip, port=>$nodes{$job_ip}->{port}, hostname=>$nodes{$job_ip}->{hostname});
                    }
                }
			}
		} else {
            # wait for job start
			$start_wait{$jid}->{zid} = $zid;			# set router id
			$start_wait{$jid}->{time} = time();			# refresh timestamp
            return ();                    # wait for response
		}
	} elsif ($cmd eq 'run') {
# user command, returns jid=>jobid [, error=>string]
		if (!$conf{services}->{queue}) {
			return (error=>"No queue service running on this host");	
		} else {
			my $time = time();
			++$gjobid;
			burp("$conf{spool}/nextid", $gjobid);

# the job file
			my $jid = $gjobid;
			my $file = "$conf{spool}/jobs/$jid";
            my $job = $args[0];
			my $gfile = "$conf{spool}/guids/$job->{guid}";

# stick the ip the job came from in the options
            $job->{time}=time();
            $job->{fromip}=$ip;
            $job->{trace}=$trace;

            if ( -e $gfile) {
			    return (jid=>slurp($gfile), already=>1);
            }
			xlog("debug", "Created $file\n") if $trace;
			open(G, ">$gfile") || return('error'=>"Can't create $gfile : $!");
			open(F, ">$file") || return('error'=>"Can't create $file : $!");
			print F packfile($job);
			close F;

			print G $jid;
			close G;
			return (jid=>$jid);
		}	
	} elsif ($cmd eq 'sstat') {
        my ($stat) = @args; 
        notifystat($stat);
	} elsif ($cmd eq 'sready') {
        my ($key) = @args;
        my ($jid) = $key =~ /^(\d+)/;
        if ($io_wait{$jid} && $io_wait{$jid}->{zid}) {
            if (!($io_wait{$jid}->{type} eq 'io')) {
                return('quit');
            } else {
                return('ready');
            }
        }
        if (! -e "$conf{spool}/jpids/$jid" ) {
            xlog("debug", "Dumping stream, $jid is dead\n");
            delete $io_wait{$jid};
            return('quit');
        }
        if ( -s "$conf{spool}/jstat/$jid.stat" ) {
            if ( $conf{loop_num} > 4 ) {
                # any time you check mod times or abandon things, remember to ensure your loop num is more than some small number
                if (fmodtime("$conf{spool}/jstat/$jid.stat")<time()-$conf{ping_secs}*3) {
                    xlog("error", "Abandoning $jid streams, because no one wants them\n") if $trace;
                    delete $io_wait{$jid};
                    return('quit');
                }
            }
        }
        xlog("debug", "Not ready for $jid, deferring (creating io_hash entry)\n") if $trace;
        $io_wait{$jid}->{streams}->{$key}=$zid;
        $io_wait{$jid}->{time}=time();
        return ();
	} elsif ($cmd eq 'stream') {
        my ($key, $data) = @args;
        my ($jid) = ($key =~ /^(\d+)/);
        if ($io_wait{$jid} && $io_wait{$jid}->{type} eq 'io' && $io_wait{$jid}->{zid}) {
            replymsg($io_wait{$jid}->{zid},$key, $data);
        } else {
			xlog("debug", "Dumping stream $key, no wait $jid\n") if $trace;
            if ($data !~ /:end/) {
                burp("$conf{spool}/jstat/$jid.dumped",1) if length($data)>0;
            } else {
                touch("$conf{spool}/jstat/$jid.dumped");
            }
        }
        return ();
	} else {
		return ('error'=>"Unknown command $cmd ($src_data)");
	}

	return ();
}

sub jhistpath {
	my ($id) = @_;
	my $left = int($id/10000);
	my $right = $id;
	my $dir = "$conf{spool}/jhist/$left";
	mkdir($dir) if ! -d $dir;
	return "$dir/$right";
}

sub child_exit {
    my ($kid, $status) = @_;
    if ($pid_jobs{$kid}) {
        my $jid=$pid_jobs{$kid}->{jid};
        if ($jid) {
            if (-s "$conf{spool}/jstat/$jid.stat") {
                notifystat(unpack_file("$conf{spool}/jstat/$jid.stat"), 1);
                touch("$conf{spool}/jpids/$jid") if -e "$conf{spool}/jpids/$jid";
            }
        }
        delete $pid_jobs{$kid};
    }
}

sub schedule {
    my $did=0;

	return unless %nodes;

    while ((my $kid = waitpid(-1, WNOHANG))>1) {
        $did=1;
        if ($conf{services}->{exec}) {
            child_exit($kid, $?);
        }
   }

    my $tcpu;
	for my $n (values %nodes) {
		if (defined $n->{a_cpus}) {
			$n->{a_cpus} = $n->{a_memory} = 0;
		}
        $tcpu+=$n->{cpus};
	}

	# pass 1 : deduct resources for running jobs
    opendir(D,"$conf{spool}/jobs");
    my @D = sort {(.5-rand()) cmp (.5-rand())} readdir(D);
    closedir(D);

    if (($conf{loop_num}%100)==5) {
        # ocassionally check for expiration of start_waits, though it should never happen
        for my $jid (keys(%start_wait)) {
            if ($conf{expire_secs} && (time() > $start_wait{$jid}->{time}+$conf{expire_secs})) {
                my $jhistfile=jhistpath($jid);
                if ( -e $jhistfile ) {
                    $did=1;
                    my $job=unpack_file($jhistfile);
                    # possibly killed during messaging? in general this shouldn't happen anymore, so it's an error
			        xlog("error", "Job $jid, reply to jinfo after archive");
		            replymsg($start_wait{$jid}->{zid},jid=>$jid,status=>$job->{status}, error=>$job->{error}, hostname=>$job->{host}, ip=>"n/a");
                }
            }
        }
    }

    my $jcnt=0;
    my $jcpu=0;
    my $jnee=0;
    for my $jrun (@D) {
        ++$jnee unless ($jrun =~ /\.ip$/);
        next unless ($jrun =~ /\.run$/);
        --$jnee;
        
        my $job=read_job($jrun);
        my $job_ip=$1 if $jrun=~/\:(\S+)\.run$/;

		# this should be config, min requirements
		$job->{cpus} = 1 if $job->{cpus} == 0;
		$job->{memory} = ($conf{default_memory}*1000) if $job->{memory} == 0;
		for ((@metrics, @labels)) {
			$job->{$_} = $conf{"default-$_"} if !$job->{$_} && $conf{"default-$_"};
		}

		# job is running, make sure it's resources are somewhat locked
		my ($jid) = $jrun =~ m/^(\d+)/;

		# check to see whether job was ever started
		if ($conf{loop_num}>5 && (fmodtime($job->{file}) < (time()-$conf{ping_secs})) && ! -e "$conf{spool}/jobs/$jid.ok") {
			rename("$conf{spool}/jobs/$jrun","$conf{spool}/jobs/$jid");
			delete $nodes{$job_ip};
			xlog("error", "No execution confirm.  Deregister $job_ip as bad, put job $jid back in the queue");
        }
		# check to see whether job has expired
		if ($conf{loop_num}>5 && $conf{expire_secs} && (fmodtime($job->{file}) < (time()-$conf{expire_secs}))) {
            $did=1;
			selfcmd('jstat', {pid=>-1, id=>$jid, status => $STATUS_ORPHAN, error=>"Orphaned job (" . (time()-fmodtime($job->{file})) . "s)"});
		} else {
			$job_ip = slurp("$conf{spool}/jobs/$jid.ip") unless $job_ip;
			if ($job_ip =~ /\d/ && $nodes{$job_ip} && $nodes{$job_ip}->{ip}) {
                ++$jcnt;
		        $jcpu+=$job->{cpus};
				$nodes{$job_ip}->{a_cpus} += $job->{cpus};
				$nodes{$job_ip}->{a_memory} += $job->{memory};
				for (@metrics) {
					$nodes{$job_ip}->{"a_$_"} += $job->{param}->{$_};
				}
			}
		}
	}


    my @nlist = values %nodes;
	# no attempt here to prioritize jobs, just match and go
	my $cnt=0;
    my $full = ($jcpu/$tcpu);

    for my $jid (@D) {
		next unless ($jid =~ /^\d+$/o);

		my $job = read_job($jid);

		next if $job->{state} eq 'hold';

		$job->{priority} = $conf{default_priority} if !$job->{priority};
        next if ($job->{priority} < 20) && ($job->{priority}/5 < (sqrt(rand())*$full));

        $full+=($job->{priority}*$job->{cpus})/($tcpu*5);

		# this should be config, min requirements
		$job->{cpus} = 1 if $job->{cpus} == 0;
		$job->{memory} = ($conf{default_memory}*1000) if $job->{memory} == 0;

		my @dereg;
		my @n;
		my ($max_av, $max_n);

		if ($cnt >= $conf{max_sched}) {
			last;
		}
        if (!@nlist) {
			xlog("debug", "No available nodes, not scheduling");
            last;
        }

		++$cnt;
		my $spread = rand() > $conf{spread_pct}/100;

		for (my $i = 0; $i < @nlist; ++$i) {
            my $n = $nlist[$i];
            
			# jobs need enough memory, cpu availability and disk space... that's it
			if (!$n->{ip}) {
				xlog("error", "Node has no ip! " . packdump($n));
				next;
			}
			my $cpus = $n->{cpus} - $n->{a_cpus};
			$cpus = $n->{avail} if $n->{avail} < $cpus;

			my $mem = $n->{tmem} - $n->{a_memory};
			$mem = $n->{mem} if ($n->{tmem} == 0) || ($n->{mem} < $mem);

#           See below, only log reason
#            if (debugging()) {
#    			xlog("debug", "Sched $jid: $n->{ip}: jcpu:$job->{cpus}, norig: $n->{orig_cpus}, ncpu:$n->{cpus}, nall:$n->{a_cpus}, nav:$n->{avail}, cpus:$cpus , jmem:$job->{memory}, nmem:$n->{mem}, avmem:$mem");
#            }

            if ($cpus <= 0) {
                # pull the node out of the list
			    xlog("debug", "Removing $n->{hostname} from node list, cpus are $cpus");
                splice(@nlist, $i, 1);
                --$i;                
            }
            # did it ping recently?
			if ($n->{ping} > (time()-$conf{ping_secs}*6)) {
				my $ok = 1;
				for (@metrics) {
					if (($n->{"param-$_"} - $n->{"a_$_"}) < $job->{param}->{$_}) {
						$ok =0;
					}
				}
                for (@labels) {
                    if (!($n->{"param-$_"} eq $job->{param}->{$_})) {
                        $ok =0;
                    }
                }

				if ( ($mem >= $job->{memory}) &&
				     (($cpus+$conf{idle_load}) >= $job->{cpus}) && 
				     ($n->{disk} >= $job->{disk}) &&
					 ($ok)
				   ) {
					next if $job->{hosts} && $job->{hosts} !~ /$n->{hostname}/;

					my $match = 1;
					if ($conf{match}) {
						$match = evalctx($conf{match}, node=>$n, job=>$job);	# eval perl expression
						if ($@) {
							$match = 1;						# permit all on error?
						}
					}
					next unless $match;

					if ($spread) {
						# use *least* busy node
						if ($n->{load} < $conf{idle_load}) {
							# don't bother checking further, this node is bored
							$max_n = $n;
							last;
						} else {
							if ($n->{avail} > $max_av) {
								$max_n = $n;
								$max_av = $n->{avail};
							}
						}
					} else {
						# use *first* node
						$max_n = $n;
						$max_av = $n->{avail};
						last;
					}
				} else {
                    my $reason = "";
                    if (!($mem >= $job->{memory})) {
                        $reason = "memory ($mem)";
                    } elsif (!(($cpus+$conf{idle_load}) >= $job->{cpus})) {
                        $reason = "cpus ($cpus)";
                    } elsif (!(($n->{disk} >= $job->{disk}))) {
                        $reason = "disk ($n->{disk})";
                    } else {
                        for (@metrics) {
                            if (($n->{"param-$_"} - $n->{"a_$_"}) < $job->{param}->{$_}) {
                                $reason = "$_ (" . $n->{"a_$_"} . ")";
                            }
                        }
                    }
                    xlog("debug", "Sched $jid: $n->{ip}: $reason");
                }
			} else {
				push @dereg, $n->{ip} if $n->{ip};
			}
		}
		for my $ip (@dereg) {
			xlog("note", "Deregister node '$ip', last ping was " . (time()-$nodes{$ip}->{ping}) . " seconds ago");
			delete $nodes{$ip};
            $did=1;
		}
		if ($max_n) {
            $did=1;
			xlog("debug", "Matched '$max_n->{ip}' to job $job->{file}") if $job->{trace};

			# todo... change this... it's an ugly way of owning jobs
			my $jmine = "$job->{file}:" . $max_n->{ip} . ".run";
			touch($job->{file}) if -e $job->{file};
			rename($job->{file}, $jmine);
			if ( -e $jmine ) {
				my $jptr = "$job->{file}" . ".ip";
				burp($jptr, $max_n->{ip});
                noderun($max_n, $jid, $job);

                # TODO: handle metrics universally, allocated, total, and current
 
                $max_n->{a_cpus} += $job->{cpus};                   # allocate
                $max_n->{a_memory} += $job->{memory};
				for (@metrics) {
					$max_n->{"a_$_"} += $job->{param}->{$_};
				}
                $max_n->{avail} -= $job->{cpus};                    # assume being used
                $max_n->{mem} -= $job->{memory};                    # assume being used
                ++$jcnt;
            } else {
			    xlog("error", "Rename failed for $jmine\n");
            }
		} else {
			if ($conf{backup_grid}) {
				# TODO: fork exec to backup grid, and add to list of pids to track... as if you're an exec node
			}
			xlog("debug", "Can't find node for $jid, $jcnt jobs running did=$did\n") if $conf{trace};
		}
	}

	# kickstart nodes, if needed
	$did|=kicknodes();

	return $did;
}

sub read_job {
    my ($jid)=@_;
    my $jfil = "$conf{spool}/jobs/$jid";
    next unless -f $jfil;
    my $ref = unpack_file($jfil);
    if (!(ref($ref) eq 'HASH')) {
        xlog("error", "Invalid job file format ($ref): $jfil -> $conf{spool}/trash/$jid\n");
        rename($jfil, "$conf{spool}/trash/$jid");
    }
    $ref->{file}=$jfil;
    return $ref;
}

sub noderun {
	my ($n, $jid, $job) = @_;
	# send 'exec'
	$job->{port} = $conf{port};		# reply to me on this port
	$job->{id} = $jid;
	if ($start_wait{$jid}) {
		# info needed for status/stdio collection from execution node
		replymsg($start_wait{$jid}->{zid},jid=>$jid, ip=>$n->{ip}, port=>$n->{port}, hostname=>$n->{hostname});
        delete $start_wait{$jid};
	}
	sendcmd($n->{ip},$n->{port},'xexec', $job);
}

# called at start, and kill -HUP
sub init {
    $ENV{HOSTNAME} = `hostname`;
    chomp $ENV{HOSTNAME};
	readconf();
    $conf{version}=$VERSION;
	if ($daemon) {
		mkdir $conf{spool};
		mkdir "$conf{spool}/jobs";
		mkdir "$conf{spool}/jstat";
		mkdir "$conf{spool}/jhist";
		mkdir "$conf{spool}/nodes";
		mkdir "$conf{spool}/pids";
		mkdir "$conf{spool}/jpids";
		mkdir "$conf{spool}/trash";
		mkdir "$conf{spool}/guids";
		# reregister on reread
		delete $conf{node};
	}

	$conf{hostip} = host2ip($conf{hostname}) unless $conf{hostip};

    if (!$conf{hostip}) {
        die "Can't start server without knowing ip.   $conf{hostname} does not resolve to ip, and no hostip defined\n";
    }
}

sub getmem {
	my ($cache, $free, $tot);
	open F, "/proc/meminfo";
	while (<F>) {
		$tot = $1 if /MemTotal:\s*(\d+)/i;
		$free = $1 if /MemFree:\s*(\d+)/i;
		$cache = $1 if /Cached:\s*(\d+)/i;
		last if $cache & $free;
	}
	close F;
	return ($tot, $cache + $free);
}

sub getcpus {
	my $cores;
	open F, "/proc/cpuinfo";
	my %cores;
    while (<F>) {
            $cores{$1}=1 if /processor\s*:\s*(\d+)/i;
    }
	close F;
	$cores = scalar keys %cores if %cores;
	return $cores;
}

sub getbench {
	my ($force)=@_;
        my $bench = slurp("$conf{spool}/bench");
	if (!$bench||$force||(fmodtime("$conf{spool}/bench")<(time()-$conf{bench_secs}))) {
		my $s = Time::HiRes::time();
		my $i=0;
		while (Time::HiRes::time() < $s+3) {
			my %d = (1..10000);
			map {$d{$_}*=3.333} keys(%d);
			map {$d{$_}/=2.222} keys(%d);
			++$i;
		}
		my $e = Time::HiRes::time();
		$bench=$i/($e-$s);
		burp("$conf{spool}/bench",$bench);
	}
	return $bench;
}

sub slurp
{
    my $dat;
    my $in = new IO::File;
    return undef unless open($in, $_[0]);
    local $/ = undef;
    $dat = $in->getline;
    $in->close;
    close($in);
    return $dat;
}

sub srvexec {
    my $did=0;

	# assure we can't flood on misconfig
	$conf{ping_secs} = 5 if $conf{ping_secs} == 0;

	if ($conf{services}->{exec} && (!$conf{node} || (time() > ($conf{node}->{ping}+$conf{ping_secs}-1)))) {
		# ping master with stats
		$conf{node}->{arch} = `arch`; chomp $conf{node}->{arch};
		($conf{node}->{tmem},$conf{node}->{mem}) = getmem();					# free mem
		$conf{node}->{load} = slurp("/proc/loadavg");			# load
		$conf{node}->{orig_cpus} = getcpus();
		$conf{node}->{cpus} = $conf{cpus} ? $conf{cpus} : $conf{node}->{orig_cpus};	# num cores
		$conf{node}->{bench} = $conf{bench} ? $conf{bench} : getbench();	# num cores
		$conf{node}->{avail} = min($conf{node}->{cpus}, $conf{node}->{orig_cpus} - $conf{node}->{load});	
		$conf{node}->{ping} = time();
		$conf{node}->{port} = $conf{port};
		$conf{node}->{ip} = $conf{hostip};
		$conf{node}->{hostname} = $conf{hostname};
		$conf{node}->{kernel} = `uname -rv`; chomp $conf{node}->{kernel};
		$conf{node}->{arch} = `uname -m`; chomp $conf{node}->{arch};

		for ((@labels,@metrics)) {
			$conf{node}->{"param-$_"} = $conf{"param-$_"};
		}
	
        $did=1;	
		$conf{registered} = 1;
		if (!sendcmd($conf{master}, $conf{master_port}, 'node', $conf{node})) {
			$conf{registered} = 0;
		}
	}

    while ((my $kid = waitpid(-1, WNOHANG))>1) {
        $did=1;	
        child_exit($kid, $?);
    }

	if (time() > ($conf{lastpidtime}+$conf{ping_secs})) {
		# check for expiration of io_wait
		if ($conf{loop_num}>5) {
            for (keys(%io_wait)) {
                if ($conf{expire_secs} && (time() > $io_wait{$_}->{time}+$conf{expire_secs})) {
                    delete $io_wait{$_};
                }
            }
        }

        $did=1;	
		opendir(D,"$conf{spool}/jpids") or die "Can't open jpids\n";

        my $mjob;
        my $oksusp;
		while(my $jid = readdir(D)) {
			next unless $jid =~ /^\d/;
            next unless fmodtime("$conf{spool}/jpids/$jid") < time()-$conf{ping_secs};
            if (-s "$conf{spool}/jstat/$jid.stat") {
                notifystat(unpack_file("$conf{spool}/jstat/$jid.stat"), 1);
                next;
            }
            # been a while... check to see if it's alive
            my $job = unpack_file("$conf{spool}/jpids/$jid");
			my $pid = $job->{pid};

            # there could be a fake pid for a jobs that "ran" but for whatever reason, never started

            next unless $pid =~ /^\d+$/;

            if ($conf{node}->{avail} < -($conf{node}->{cpus}/2)) {
                if (!$mjob && ($job->{priority} < $mjob->{priority})) {
                    $mjob = $job;
                }
                ++$oksusp;
            }

            if ($conf{node}->{avail} > 0) {
                if ( -e "$conf{spool}/jstat/$job->{id}.held" ) {
                    kill(-18, $pid);
	                xlog("note", "Resuming $job->{id}, because node is available");
	                sendcmd($conf{master},$conf{master_port}, 'jedit', $mjob->{id}, state=>'');
                    unlink("$conf{spool}/jstat/$job->{id}.held");
                }
            }

    # wait for pids
            my $alive = kill(0, $pid);
            if ($alive) {
                $io_wait{$jid}->{time}=time() if ($io_wait{$jid});
                notifystat({id=>$jid, jrun=>1});
                touch("$conf{spool}/jpids/$jid") if -e "$conf{spool}/jpids/$jid";
            } else {
                notifystat({id=>$jid, status=>$STATUS_ORPHAN, error=>"Unknown $jid exit code, really bad!", dumped=>1});
            }
		}
		closedir D;

        if ($mjob && $oksusp > 1) {
            kill(-19, $mjob->{pid});
            touch("$conf{spool}/jstat/$mjob->{id}.held");
	        xlog("note", "Suspending $mjob->{id}, because node is busy");
	        sendcmd($conf{master},$conf{master_port}, 'jedit', $mjob->{id}, state=>'susp');
        }
        
		$conf{lastpidtime} = time();
	}
}

sub touch {
	my $nowisthe=time();
	return utime($nowisthe, $nowisthe, @_);
}

sub fmodtime {
	return (stat($_[0]))[9];
}

sub notifystat {
	my ($stat, $from_jstat, $nowait) = @_;

    confess("stat is required\n") if (!$stat);

    if (!$stat->{jrun}) {
        if ($io_wait{$stat->{id}} && $io_wait{$stat->{id}}->{zid}) {
            # tell the client that the job is done
            replymsg($io_wait{$stat->{id}}->{zid},"$stat->{id}:stat",$stat);
        } else {
            # it's not a request to notify that came from the file
            # and yet, the file is there, with, presumably, valid status info
            # so which do we report?   

            # I assume, report what's in the file....ie: it was there first
            # this can (rarely) happen if you kill a job after it completes successfully, for example

            # TODO: the safer thing is to report the "worst case" ... ie: if either is an error, report error
            # and if both agree ... then don't log anything here... not a problem

            if ( ! $from_jstat && -s "$conf{spool}/jstat/$stat->{id}.stat") {
                # log the problem for inspection
	            xlog("error", "Got alternative status for $stat->{id}, this may not be correct!");
                # save the new status as an error file to inspect later
                burp("$conf{spool}/jstat/$stat->{id}.stat-err", packfile($stat));
            }
        }
    }

    # already dack'ed
    if ( -e "$conf{spool}/jstat/$stat->{id}.dack" ) {
	    xlog("debug", "Not notifying status for $stat->{id}, dack already set");
        exec_clean($stat->{id});
        return;
    }

	xlog("debug", "Notifying status " . packdump($stat) . ".\n") if ($conf{trace} || $stat->{trace});

    # tell main queue about the status change
    $stat->{replyport}=$conf{port};
    if ($nowait) {
    	sendcmd_nowait($conf{master},$conf{master_port}, 'jstat', $stat);
    } else {
    	sendcmd($conf{master},$conf{master_port}, 'jstat', $stat);
    }
}

# unpack a very simple configuration-style file
sub funpack {
	my ($fil, $dat) = @_;
	return gunpack(slurp($fil), $dat);
}

sub gunpack {
	my ($msg, $dat) = @_;
	$dat = {} if !$dat;
	for (split(/\n/, $msg)) {
		my ($k, $v) = m/^\s*([^:=]+)?\s*[:=]\s*(.*?)\s*$/;
		$k = lc($k);
		$dat->{$k}=$v;
	}
	return $dat;
}

# more complex config file support
# contains logic for turning delimited lists into array configs, etc.
sub readconf {
	%conf = %def;

	_readconf("$conf{config}");

	# defines happen at the end so defaults can get unpacked
	for (keys %conf) {
		next if ref $conf{$_};
		if ($_ eq 'match' || $_ =~ /^label/) {
			# match rules are evaluated during matching, but reval now just to test
			my $test = $conf{$_};
			# see http://www.perlmonks.org/?node_id=685699 for why this is OK
			$test =~ s/`[^`]+`/1/g;			# turn off backtics
			$test =~ s/system\([^\)]\)/1/g;		# turn off system calls
			$safe->reval($test);			# check syntax
			if ($@) {
				# report a problem with the rule
				xlog("error", "Error testing match rule : $@");
			}
			$@='';
		} elsif ( ! ($conf{$_} =~ s/^\{(.*)\}$/eval($1)/gei) ) {
			# evaluate simple inline vars at configure-time
			if ( $conf{$_} =~ m/\$([\w-]+)\{/) {
                xlog("error", "Error, rule has a hash variable, which requires braces\n");
            } else {
    			$conf{$_} =~ s/\$([\w-]+)/$conf{lc($1)}?$conf{lc($1)}:$1/gei;
            }
		}
		if ($_=~ /^param-(.*)/) {
			my $nm=$1;
			# evaluates to the value for that param... if numeric is a "metric" otherwise is a "label"
			if ($conf{$_} =~ /^[\d.]+$/) {
				push @metrics, $nm;
			} else {
				push @labels, $nm;
			}
		}
	}

	# reorganize some conf vars into a hash
	for my $k (qw(services log_types)) {
		my $v;
		$v = $conf{$k};
		$conf{$k} = {};
		for (split(/[\s,]+/,$v)) {
			$conf{$k}->{$_} = 1;
		}
	}

    # these low-level entries are controlled by trace bits in packets... not by user preference
    $conf{log_types}->{trace}=1;

	# stored as an array reference
	$conf{env} = [split(/[\s,]+/,$conf{env})] unless ref($conf{env}) eq 'ARRAY';

	# split up host/port
	$conf{port} = $1 if $conf{bind} =~ s/:(\d+)$//;
	# same for master (if different - has to be if there's a queue/exec on the same box)
	$conf{master_port} = $1 if $conf{master} =~ s/:(\d+)$//;
	$conf{master_port} = $conf{port} if !$conf{master_port};
}

# basic config reader, like funpack, but uses the %conf and %def hashes
sub _readconf {
	my ($f) = @_;
	%conf = %def;
	if (!open(CONF, $f)) {
		xlog("error", "Can't open '$f'");
		die("Can't open config '$f'\n");
	}
	while(<CONF>) {
		next if /^\s*#/;
		my ($k, $v) = m/^\s*([^:]+)?\s*:\s*(.*?)\s*$/;
		$k = lc($k);
		if ($k eq 'include') {
			_readconf($v);
		} else {
			$conf{$k} = $v;
		}
	}
	close CONF;
}

# log stuff.   TODO: cache opened handles and test for operation... that way you won't have to reopen so many!
sub xlog {
    my $m = join("\t", @_);
	my $class = $_[0];
	return unless ref($conf{log_types}) && $conf{log_types}->{$class};
    $m =~ s/\n/ /g;
    my $line = scalar(localtime) . "\t" . $m . "\n";
    my $log = $conf{"log_file"};
    if ($log && ! ($log eq '-')) {
        open LOG, ">>" . $log;
        print LOG $line;
        close LOG;
        print STDERR $line if $log_to_stderr;
    } else {
        print $line;
    }
    return $line;
}

# wait for STDERR and STDOUT from a command
sub waitio {
    my ($host, $port, $jobid, $opt) = @_;
    my @resp;
    my $stat;
    my $err;
    my $diderr;

    my $sock = _sendcmd($host, $port, 'xio', $jobid);

    my $stat_time;
    $|=1;
    my $needio = !$opt->{err}+!$opt->{out};
    my $start_wait=time();
    my $unk;
    while ((!defined $stat) || $needio) {
        my $got = 0;
        # wait up to 5 seconds for output
        zmq_poll([{
        socket=>$sock, events=>ZMQ_POLLIN, callback=> sub {
        my ($ip, $trace, $key, $dat) = recvmsg($sock);
        if ($ip) {
            $got = 1;
            my ($jid, $type, $cmd) = split /:/, $key;
            if ($type eq 'err') {
                $diderr = 1 if $dat;
                print STDERR $dat if $dat;
                --$needio if $cmd eq 'end';
            } elsif($type eq 'out') {
                print STDOUT $dat if $dat;
                --$needio if $cmd eq 'end';
            } elsif($type eq 'stat') {
                $stat = $dat->{status};
                $err = $dat->{error};
                $stat_time=time() if !$stat_time;

                if (time()>($stat_time+$conf{expire_secs})) {
                    # i'm taking charge of dumping it
                    xlog("error", "Job $jobid, dumping i/o, and reporting failure... took too long");
                    $dat->{dumped}=1;
                }

                if ($dat->{dumped}) {
                    # don't wait longer for i/o if the i/o was lost
                    # todo... fail here if stat is 0?
                    if (!$stat) {
                        $stat=37;
                        $err="Failing because i/o was dumped";
                    }
                    $needio = 0;
                }
                # what does the client do with the times?
            } elsif($type eq 'unk') {
                ++$unk;
                sleep(1);
            } else {
                xlog("error", "Job $jobid, got message ($jid, $type, $cmd) in response to xio");
            }
        } else {
            xlog("error", "Job $jobid, got message ($key) in response to xio");
        }
        }}],$conf{retry_secs}*2*1000);

        if (!$got) {
            # if you didn't get anything, ask again
            $sock = _sendcmd($host, $port, 'xio', $jobid);
            if (time()>$start_wait+$conf{retry_secs}*7) {
                # been a while...ask head node if this job is dead?
                my %info = waitmsg($conf{master}, $conf{master_port}, 'jinfo', $jobid);
                if ($info{status} =~ /\d/) {
                    $stat=$info{status};
                    $err=$info{error};
                    $needio = 0;
                }
                if ($unk > 200) {
                    $stat=$STATUS_UNKNOWN;
                    $err="Error, job submission failed";
                    $needio = 0;
                }
                # restart wait time
                $start_wait=time();
            }
        }
    }
    return ($stat, $err, $diderr);
}

sub waitmsg {
    my $sock = _sendcmd(@_);
    my ($ip, $trace, @msg) = recvmsg($sock);
    return @msg;
}

# this tries over and over to get a response....
# usually this is not needed, but if the outer disappears, the zqm-id will be lost, so 
# this is just for recovery in the event of the router shutting down
sub waitmsg_retry {
    my $retry = shift @_;

    my $got=0;
    my ($ip, $trace, @msg);
    my $sock=_sendcmd(@_);
    while (!$got) {
        zmq_poll([
        {
                socket=>$sock, events=>ZMQ_POLLIN, callback=> sub {
                $got=1;
                ($ip, $trace, @msg) = recvmsg($sock);
        }},
        ],$retry);

        if (!$got){
            $sock=_sendcmd(@_);
        }
    }
    return @msg;
}

sub sendcmd_nowait {
    my ($host, $port, @cmd) = @_;
    return 0 unless my $sock = getsock($host, $port);
    my $xcmd = packcmd(@cmd);

    # 1 millisecond wait
#    zmq_poll([
#    {
#            socket=>$sock, events=>ZMQ_POLLOUT, callback=> sub {}
#    },
#    ],1000);

    if (zmq_send($sock, "", 0, ZMQ_SNDMORE|ZMQ_NOBLOCK)==0) {
        if (zmq_send($sock, $xcmd)==-1) {
            xlog("error","Can't send [@cmd] to $host:$port : $!", Carp::longmess());
            return 0;
        }
    } else {
        return 0;
    }
    return 1;
}

sub recvmsg {
    my ($sock) = @_;
    my @ret;
    my ($buf, $dat);
    if (my $msg = zmq_recvmsg($sock)) {
        $msg = zmq_recvmsg($sock);
        if ($msg) {
            my $buf = zmq_msg_data($msg);
            xlog("debug", "Client $$ got response: $buf") if $conf{trace};
            if ($buf) {
                return unpackcmd($buf);
            }
        }
    }
    return @ret;
}


sub sendcmd {
	my $sock = _sendcmd(@_);
	return $sock ? 1 : undef;  
}

sub packcmd {
    if (!$conf{hostip}) {
        confess("Need a defined hostip");
    }
    return encode_json([$conf{hostip},$conf{trace},@_]);
}

sub packref {
    if (!(ref($_[0]) eq 'ARRAY')) {
        croak "All packrefs are arrays";
    } else {
        return encode_json([$conf{hostip},$conf{trace},@{$_[0]}]);
    }
}

sub unpackcmd {
    my $ref = eval{decode_json($_[0])};
    if (!$@ && (ref($ref) eq 'ARRAY')) {
        return @$ref;
    } else {
        return undef;
    }
}

sub packfile {
    croak unless ref $_[0];
    if (ref $_[0] eq 'HASH') {
        $_[0]->{version}=$VERSION;
    }
    return encode_json($_[0]);
}

sub unpack_file {
    my $ref;
    eval {
        $ref=decode_json(slurp($_[0]));
    };
    carp "$@" if $@;
    return $ref; 
}

### sends a command to a server/router, returns the socket to wait on 
sub getsock {
	my ($host, $port) = @_;
    my $sock;
    if (!$ZMQS{"tcp://$host:$port"}) {
        $sock = zmq_socket($context, ZMQ_DEALER);
        if (!$daemon) {
            # clients should block if messages are queued, not bang on nonexistant servers
            zmq_setsockopt($sock, ZMQ_HWM, 50);
	        xlog("debug", "Set HWM to 50 for $host in pid $$\n");
        }
        if (!zmq_connect($sock,"tcp://$host:$port")) {
            $ZMQS{"tcp://$host:$port"} = $sock;
        } else {
            croak "Can't connect to tcp://$host:$port: $@\n";
        }
    } else {
        $sock = $ZMQS{"tcp://$host:$port"};
    }

	if (!$sock) {
		xlog("error",$@="Can't connect to $host:$port", Carp::longmess());
		return undef;
	}
    return $sock;
}

sub _sendcmd {
	my ($host, $port, @cmd) = @_;
    return undef unless my $sock = getsock($host, $port);
	my $xcmd = packcmd(@cmd);
    zmq_send($sock, "", 0, ZMQ_SNDMORE);
	if (zmq_send($sock, $xcmd)==-1) {
		xlog("error","Can't send [@cmd] to $host:$port : $!", Carp::longmess());
    }
    return $sock;
}

sub burp
{
        my ($f, $dat) = @_;
        my $h = new IO::File;
        do {
            eval {
                open ($h, ">$f.tmp") || die "$f: $!";
                print $h $dat;
                close $h;
                rename("$f.tmp", $f) || die "$f: $!";
            };
            if ($@) {   
                xlog("error",$!);
            }
        } while ($@);
}

sub usage {
	my $u;
	$u .= <<'EOF' unless $make;
Usage: grun <exec-options> command...
   or: grun -d [<local-daemon-options>]
   or: grun -k <jobid> [<jobid2...>]
   or: grun -e key=val[,key2=val2...] <jobid> [<jobid2...>]
   or: grun -q [<query-options>] <query-command>

Lightweight job queueing system

For more help, run grun -?, grun -d -?, grun -C -? or grun -q -?.
EOF

	$u .= <<'EOF' unless $daemon || $qinfo || $editjob || $make;

Execution Options:
    -f|ile FILE     Read FILE for job options (mem, cpu, cmd, etc)
    -m|em INT       memory minimum in MB
    -c|pu CPUS      minimum number of cpus 
    -host N1,N2     specify certain hosts
    -j|obid TEXT    user-supplied job ID
    -v|erbose       print job id, execution node and stats to STDERR
    -M|make         only run job if inputs are newer than outputs
    -r|priority INT  run job with only run job if inputs are newer than outputs

    -noio             disable io-processing, but wait for completion
    -nowait         no io and don't wait, just start the command
    -e|rr FILE      write stderr directly to FILE, no spool *
    -o|ut FILE      write stdout directly to FILE, no spool *

All options can be abbreviated to uniqueness.

* You can append error & output with -oa, -ea or -out-append -err-append, 
or both with -oea, or --out-err-append.

If the command contains shell metacharacters, it's wrapped in a bash script
EOF

	$u .= <<'EOF' if $make;
Make Semantics:

    %i:file    Input file, left in the command line
    #i:file    Input file, removed the command line before executing
    %o:file    Output file, left in the command line
    #o:file    Output file, removed the command line before executing
    < file     Input file, left in the command line
    > file     Output file, left in the command line
    %!>file    Output, but don't set as a dependency

For Example:
    grun "gzip %i:x.foo #o>x.foo.gz"

If a command fails under Make Semantics, the output file(s) will be 
set to FILE.failed.

NFS sync works better with make semantics.

EOF

	$u .= <<'EOF' if $qinfo;

Query Options:
    -a|ll           Query all nodes
    -n|odes         ($master) List of nodes to query

Query Commands:
    [-]status       List nodes (q)
    [-]jobs         List jobs (q)
    [-]history      List prior jobs (q)
    [-]conf         Dump config from memory (q,e)
EOF

	$u .= <<'EOF' if $daemon;

Daemon Options:
    -h|osts         (local) One or more hosts
    -r|eload        Reload config
    -k|ill          Kill running server
    -R|ESTART       Kill and restart a running server

Without an option, -d just starts the daemon on the local machine.
EOF
        $u .= <<'EOF' if $editjob;

Edit Keys:
    hold              Hold job (no value needed)
    resume            Resume job
    memory=N          Memory in MB
    cpus=N            # of Cpus needed
EOF

	$u .= <<'EOF' if !$make;

Common Options:
    -C FILE         (/etc/grun.conf) Config file location
    -t|race         Turn on debugging in the log file
    -V              Print version and exit
    -?              Show this help page
EOF

	$u .= <<'EOF' if defined($config) && $config eq '';

Configuration File:

All config variables written as {value} are interpreted as perl code, and get evaluated at startup.

The "include" varialbe actually just includes the file specified, as if it were part of the original file.

All non-code configuration variables can include '$varname', which gets expanded to the value of another config var.

Be careful with match code.  It it's slow, it will kill the performance of your main node.

Common variables:

    master            (localhost) Hostname[:port] of master node
    spool             (/var/spool/grun) Location for queue & io temp storage
    log_file          Location of the log
    services          Must be 'queue' and/or 'exec'
    port              Port to listen on (5184)
    bind[:port]       Address to bind to (0.0.0.0)
    trace             Turn tracing on for the whole server

Queue config vars:

    env               (PATH) List of environment varialbes to copy to the processes.  An asterisk (*) means 'ALL'
    expire_secs       (0) If set, jobs that aren't pinged in time get (failed or retried)
    expire_action     (retry) Can be 'retry', 'fail'
    idle_load         (.3) If load is less than this amount, then considered idle
    io_keep           (3600) Time to keep unretrieved stdio files (0=forever)
    log_file          Where to send "xlog" output
    pid_file          (/var/run/grun.pid)
    ping_secs         (30) Nodes ping the master this often.
    ping_expire       (2*$ping_secs) Drop a node if it doesn't ping in time

Cli vars & defaults:

    nfs_sync          (1) Whether to force-sync the directory cache after a job is run
    default_cpu       (1) Default cpu reservation
    default_memory    (1m) Default memory for jobs
    default_priority  (20) Default priority for jobs

Execution node config vars:

    match             Perl code that must eval to TRUE for a node match
    full_match        (1) If jobs queue is full, this is evaluated
    full_exec         If full match returns true, then this command is run
    wrap              Job command wrapper
    
EOF
   return $u;
}

sub showconf {
	return pretty_encode(\%conf);
}

sub showhist {
	my %opt;

	$opt{fmt} = '%jid\t%user\t%stat\t%cwd\t%cmd\t%host\t%mtime\n';

    {
            local @ARGV = @_;
            GetOptions(\%opt, "count|c=i", "user=s","job=i@","resubmit","fmt|F=s","long","grep|g=s","dump");
            @_=@ARGV;
    }

	if ($_[0] =~ /[a-z]/ && !$opt{user}) {
		$opt{user} = $_[0]
	} else {
        while ($_[0] =~ /^\d+$/) {
            push @{$opt{job}}, $_[0];
            shift;
        }
    }

    my $r;              # the result

	my $count = $opt{count}; 
	$count = 10 if !$count;
    my @J;
	if ($opt{job}) {
        for (@{$opt{job}}) {
            my $f = (jhistpath($_));
            if ( -e $f ) {
                my $t=unpack_file($f);
                my ($jid) = $f =~ /\/(.+)$/;
                $t->{jid}=$jid;
                my $mtime = (stat($f))[9];
                $t->{mtime}=$mtime;
                push @J, $t;
            } else {
                xlog("error", "History for job $_ requested, but file not found\n");
            }
        }
	} else {
        my $k = 5;
        my @mx = ((-1) x $k);
		opendir(D,"$conf{spool}/jhist");
		while(defined ($_=readdir(D))) {
			next unless /^\d+$/;
            for (my $i = 0; $i < $k; ++$i) {
    			if ($_ > $mx[$i]) {
                    # shift everything up
                    my $top=$i++;
                    for (; $i < $k; ++$i) {
                        $mx[$i]=$mx[$i-1];
                    }
                    $mx[$top]=$_;
                    last;
	    		}
            }
		}
		closedir(D);

        for (my $i = 0; $i < $k; ++$i) {
            if (@J < $count) {
                opendir(D,"$conf{spool}/jhist/$mx[$i]");
                my @T = readdir(D);
                @T = sort {$b <=> $a} @T;
                closedir(D);
                # prepend dir
                for my $jid (@T) {
                    next unless $jid=~/^\d/;
                    my $f = "$conf{spool}/jhist/$mx[$i]/$jid";
                    my $job=eval{unpack_file($f)};
                    next unless ref($job);

                    # support array/text formats
                    my @cmd=ref $job->{cmd} ? @{$job->{cmd}} : ($job->{cmd});

                    # user supplied filter
                    next if $opt{user} && ! ($job->{user} eq $opt{user});
                    next if $opt{grep} && "$job->{user}\t@cmd\t$job->{cwd}" !~ $opt{grep};

                    my $mtime = (stat($f))[9];

                    next unless $mtime;

                    $job->{jid}=$jid;
                    $job->{mtime}=$mtime;

                    push @J, $job;
                    last if @J >= $count;
                }
            }
        }
	}

    for my $job (@J) {
		my $jid=$job->{jid};
		my $mtime = $job->{mtime};
       
        my @cmd=ref $job->{cmd} ? @{$job->{cmd}} : ($job->{cmd});
        next if $opt{user} && ! ($job->{user} eq $opt{user});
        next if $opt{grep} && "$job->{user}\t@cmd\t$job->{cwd}" !~ $opt{grep};
        --$count;
        my %job=%{$job};

        $job{status} = $job->{usage}->{status} if $job->{usage} && ! $job->{status};
        $job{status} = ($job{status} >> 8) if 0 == ($job{status} & 0xFF);
        # standard
        $job{status} = 'OK' 		if $job{status} eq 0;
        $job{status} = 'KILLED' 	if $job{status} eq 199;
        $job{status} = 'INT' 		if $job{status} eq 2;
        $job{status} = 'ORPHAN' 	if $job{status} eq $STATUS_ORPHAN;
        $job{status} = 'SIGSEGV'	if $job{status} eq 11;
        $job{status} = 'SIGPIPE'	if $job{status} eq 13;
        $job{status} = 'SIGFPE'		if $job{status} eq 8;

        # linux specific
        $job{status} = 'NOTFOUND' 	if $job{status} eq 127;
        $job{status} = 'ASSERT'		if $job{status} eq 134;
        $job{status} = 'OUTOFMEM'	if $job{status} eq 137;
        $job{status} = 'ALTSEGV' 	if $job{status} eq 139;
        $job{stat} = $job{status};

        $job{wait} = $job{usage}->{start_time}-$job{time};

        my $cmd = join(' ', @cmd);
        $cmd =~ s/\n\s+/\n/g;
        $cmd =~ s/^\s+//;
        $cmd =~ s/\s+$//;
        $cmd =~ s/\n/;/g;
        if ($nodes{$job{host}} && $nodes{$job{host}}->{hostname}) {
            $job{host}=$nodes{$job{host}}->{hostname} 
        } elsif (-e (my $n="$conf{spool}/nodes/$job{host}.reg")) {
            my $node=unpack_file($n);
            $job{host}=$node->{hostname};
        }
        $job{jid}=$jid;
        $job{cmd}=$cmd;
#			$job{env}=join ':' . map { $_.'='.$job{env}{$_} } keys %{$job{env}};
        $job{mtime}=fmtime($mtime);
        if (ref($job{usage})) {
            for (keys(%{$job{usage}})) {
                $job{"usage_".$_}=$job{usage}{$_};
            }
            delete $job{usage};
        }
        if ($opt{long}) {
            $r .= "----------\n";
            for(sort(keys(%job))) {
                $r .= "$_=" . (ref($job{$_})?packdump($job{$_}):$job{$_}) . "\n";
            }
        } elsif ($opt{dump}) {
            $r .= packdump($job) . "\n";
        } else {
            $r .= fmtstr($opt{fmt}, \%job);
        }
        
		last if $count == 0;
    }
    return $r;
}

sub fmtime {
	my ($t) = @_;
	return strftime("%m/%d %H:%M",localtime($t));
}

# grun -q status
sub shownodes {
        my $r;
        $r .= sprintf "%-15s  %-14s %-8s Bench\n", 'Hostname','Memory', 'Cpu';
	my @nodes = getnodes();
        for my $node (sort {$a->{hostname} cmp $b->{hostname}} @nodes) {
                if ($node->{ping} > time() - ($conf{ping_secs} * 2) ) {
                        chomp($node->{hostname});
                        $node->{hostname} = substr($node->{hostname},0,15);

			my $cpus = $node->{cpus} - $node->{a_cpus};
			$cpus = $node->{avail} if $node->{avail} < $cpus;

			my $mem = $node->{tmem} - $node->{a_memory};
			$mem = $node->{mem} if $node->{tmem} > 0 && $node->{mem} < $mem;

	                $r .= sprintf "%-15s %6dm/%-6d %4.1f/%-2d %3d\n", $node->{hostname}, $node->{mem}/1000, $node->{tmem}/1000,$node->{avail},$node->{cpus},$node->{bench};
                }
        }
        return $r;
}

# grun -q jobs
sub showjobs {
	my %opt;

	$opt{fmt} = '%s:jid\t%s:user\t%s:stat\t%s:cwd\t%s:cmd\n';
	{
		local @ARGV = @_;
		GetOptions(\%opt, "dump|d", "user=s","job=i","fmt|F=s","long","debug");
		@_=@ARGV;
	}

	if ($_[0]) {
  	    my $user = shift @_;	
		if ($user =~ /^\d+$/) {
			$opt{job}=$user;
		} else {
			$opt{user}=$user if !($user eq '');
		}
	}

	xlog("debug", "Show job for user:$opt{user}, job:$opt{job}\n");

        my $r;
#        $r .= sprintf "%s\t%s\t%s\t%s\t%s\n", 'JobID','User','Host','Cwd','Command';

        my $now = time();
        opendir(D,"$conf{spool}/jobs");
        while(my $jid=readdir(D)) {
		next if $jid =~ /\.ip$/;
		next if $jid =~ /\.ok$/;
        my $f = "$conf{spool}/jobs/$jid";
        next unless -f $f;
        my $job = unpack_file($f);
		if (ref($job)) {
			my %job=%{$job};
			next if $opt{user} && ! ($opt{user} eq $job{user});
			next if $opt{job} && ! ($jid =~ /^$opt{job}\b/);
			my $stat = '(I)';
			$stat = '(H)' if $job->{state} eq 'hold';
			if ($jid =~ s/:([.\d]+?)\.run$//) {
				my $ip = $1;
				my $node = unpack_file("$conf{spool}/nodes/$ip.reg");
                $stat = $node->{hostname}; chomp $stat;
                $job->{host} = $stat;
                # ip file is created at start
                # run file is updated regularly to prevent orphans
                if ( -e "$conf{spool}/jobs/$jid.ip" ) {
    			    $job->{start} = (stat("$conf{spool}/jobs/$jid.ip"))[9];
                }
			    $stat .= ' (S)' if $job->{state} eq 'susp';
			} else {
			    $stat = '(S)' if $job->{state} eq 'susp';
            }
			# trim for display
            my @cmd = @{$job->{cmd}};
			for (@cmd) {
				s/^\s+//g;
				s/\n\s*/;/g;
			}
			$job->{wait} = ($job->{start}?$job->{start}:$now)-$job->{time};
			$job->{jid} = $jid;
			$job->{stat} = $stat;
			$job->{cpus} = 1 if ! $job->{cpus};
			$job->{memory} = $conf{default_memory} if ! $job->{memory};
			$job->{priority} = $conf{default_priority} if ! $job->{priority};
			$job->{cmd} = join(' ', @cmd);
			if ($opt{long}) {
                $r .= "----------\n";
                for(sort(keys(%$job))) {
                    $r .= "$_=" . (ref($job->{$_})?packdump($job->{$_}):$job->{$_}) . "\n";
                }
			} elsif ($opt{dump}) {
				$r .= packdump($job) . "\n";
			} else {
				$r .= fmtstr($opt{fmt}, $job);
			}
		}
        }
        closedir(D);
        return $r;
}

sub fmtstr {
    my ($fmt, $hash) = @_;

# get list of fields
    my @fds = $fmt =~ m/%(?:[#0 +,I:-]*(?:\d+)?(?:\.\d+)?\w{1,2}:)?([\w-]+|(?:{[^{}]+}))/g;

	%{$safe->varglob("i")}=%{$hash};
	my @vals;
	for (@fds) {
		if ($_ =~ /^\{(.+)\}/) {
			push @vals, $safe->reval($1);
		} else {
            if (ref($hash->{$_}) eq 'HASH') {
    			push @vals, packdump($hash->{$_});
            } else {
    			push @vals, $hash->{$_};
            }
		}
	}
	undef %{$safe->varglob("i")};

	# replace formatted - with format-only
        $fmt           =~   s/(%[#0 +,I:-]*(?:\d+)?(?:\.\d+)?\w{1,2}):([\w-]+|({[^{}]+}))/$1/g;

	my $fds=join '|', map quotemeta($_), @fds;
	# replace pure-fields with field-only
        $fmt =~ s/%($fds)/%s/g;
	# expand vars

        $fmt =~ s/\\n/\n/g;
        $fmt =~ s/\\t/\t/g;
        $fmt =~ s/\\r/\r/g;

	# format the rest
        return sprintf($fmt, @vals);
}

sub bestunique {
	my ($c, @c)  = @_;
	my $b;
	for (@c) {
		if ($_ =~ /^$c/) {
			return undef if ($b);
			$b = $_;
		}
	}
	return $b;
}

# returns an array of [host,port], given a list of host[:port] names
sub expandnodes {
	my @r;
	my @n;
	for (split(/[\s,]/, join ' ', @_)) {
		my ($h, $p) =  m/^(.*)(:\d+)?$/;
		$p = $conf{port} if !$p;
		if ($h =~ s/\*/\.\*/g) {
			if (!@n) {
				@n=getnodes();	
			}
			for (@n) {
				push @r, [$_->{hostname}, $p] if $_->{hostname} =~ /$h/;
			}
		} else {
			push @r, [$h, $p];
		}
	}
	return @r;
}

sub evalctx {
	my ($expr, @ctx) = @_;
	my $msafe = new Safe;
	$msafe->permit(qw(:subprocess :filesys_read));        # allow backticks
	for(my $i = 0; $i < @ctx; $i+=2) {
		my ($name, $var) = ($ctx[$i], $ctx[$i+1]);
		# references to hashes/arrays become dereferenced hashes/arrays
		if (ref($var) eq 'HASH') {
			%{$msafe->varglob($name)}=%{$var};
		} elsif (ref($var) eq 'ARRAY') {
			@{$msafe->varglob($name)}=@{$var};
		} else {
			${$msafe->varglob($name)}=$var;
		}
	}
	$msafe->share(qw(%conf %ENV));
#	if ($conf{trace}) {
#		xlog("debug", "Evaluating {$expr}\n");
#	}
	my ($res, @res);
	if (wantarray) {
		@res = $msafe->reval($expr);
	} else {
		$res = $msafe->reval($expr);
	}
	my $save = $@;
	if ($@) {
		xlog("error", "Error evaluating {$expr} : $@\n");
		return undef;
	}
	$@=$save;
	if (wantarray) {
		return @res;
	} else {
		return $res;
	}
}

sub kicknodes {
    my $did=0;
    my @nodes = getnodes(cached=>1);
	return if !$conf{kickstart};
    return if (time() < ($conf{lastkicktime} + $conf{ping_secs} * 4));
	$conf{lastkicktime} = time();
	for my $node (@nodes) {
		# not just started
		if ($conf{loop_num} > 5) {
		# sometime in the last 24 hours?
		if ($node->{ping} > (time() - ($conf{remove_secs}))) {
			# but not in the last couple minutes
			if ($node->{ping} < (time() - ($conf{ping_secs} * 4)) ) {
				# kick it
                $did=1;
				xlog("note", "Kicking node $node->{hostname}\n");
				if (!fork) {
                    eval {
                        zmq_fork_undef();
                        # be sure this is gone
                        my $cmd = $conf{kickstart};
                        if ($cmd =~ /^\{(.*)\}$/) {
                            $cmd = evalctx($1, node=>$node);
                        }
                        if ($cmd && $cmd !~ /\$/) {
                            exec("$cmd");	
                        }
                    };
                    exit(0);
				}
			}
		}
        }
	}
    return $did;
}

sub getnodes {
	my (%opt) = @_;
	my @r;
	if (!$opt{cached} && %nodes) {
		# return memcached values
		return values %nodes;
	}
	# read all from disk, including old ones
    opendir(D,"$conf{spool}/nodes");
    while($_=readdir(D)) {
        $_ = "$conf{spool}/nodes/$_";
        next unless -f $_;
        my $node=eval{unpack_file($_)};
        if ($node) {
            if (! defined $node->{avail}) {
                $node->{avail} = $node->{cpus}-$node->{load};
            }
            push @r, $node;
        }
	}
	closedir D;
	return @r;
}

sub startdaemon {
	my ($dkill, $restart, $reload, $node, $all, $dobench, $nofork);

	GetOptions("kill"=>\$dkill, "restart|R"=>\$restart, "benchmark|B"=>\$dobench, "reload|r"=>\$reload, "host=s"=>\$node, "all"=>\$all, "nofork|F"=>\$nofork) ||
		die usage();

	$node = '*' if $all;

	if ($reload || $node) {
		my @n = $node ? expandnodes($node) : ([$conf{master},$conf{master_port}]);
	
		my $stat = 0;
		my $xcmd = $reload ? 'relo' : $restart ? 'rest' : $dkill ? 'term' : '';
		if (!$xcmd) { 
			die usage();
		}
		for (@n) { 
			my ($res) = waitmsg($_->[0], $_->[1], "xcmd", $xcmd);
			if (! defined $res) {
				print "$@\n";
				$stat = 1;
			} else {
				print "$res\n";
			}
		}
		exit $stat;
	}

	if ($dobench) {
		print STDERR "Bench\t" . int(getbench(1)) . "\n";
		exit 0;
	}

	if ($restart) {
		$daemon = 1;
	}

	killgpid() if $dkill || $restart;
	exit 0 if $dkill;
	sleep 1 if $restart;

	# start daemon
	if ($gpid) {                                             # already running?
		if (kill(0, $gpid)) {
			die "Already running ($gpid), not starting twice\n";
		}
	}
	xlog("note", "Starting daemon as " . getpwuid($<));

    my $limit = `bash -c "ulimit -v -f -t -m -s"`;

    if (@{[($limit =~ m/unlimited$/mg)]}!=5) {
        my @n=qw(virt file time mem stack);
        my @v=$limit =~ m/(\w+)$/mg;
        my %d;
        map {$d{$n[$_]}=$v[$_] if ! ($v[$_] eq 'unlimited')} (0..4);
        warn "Note: ulimit is not unlimited for daemon (" . join(' ',%d) . ")\n";
    }

    if ($conf{ulimit}) {
        if (eval {require BSD::Resource;}) {
            local @ARGV = split / /, $conf{ulimit};
	        Getopt::Long::Configure qw(no_require_order no_ignore_case);
            my %opt;
            GetOptions(\%opt,"f=i","v=i","m=i","t=i");
            if (@ARGV) {
                die "Options @ARGV not supported by grun, use a regular ulimit wrapper";
            }
            no strict "subs";
            BSD::Resource::setrlimit(BSD::Resource::RLIMIT_FSIZE(),$opt{f},$opt{f}) if ($opt{f});
            BSD::Resource::setrlimit(BSD::Resource::RLIMIT_VMEM(),$opt{v},$opt{v}) if ($opt{v});
            BSD::Resource::setrlimit(BSD::Resource::RLIMIT_RSS(),$opt{m},$opt{m}) if ($opt{m});
            BSD::Resource::setrlimit(BSD::Resource::RLIMIT_CPU(),$opt{t},$opt{t}) if ($opt{t});
        } else {
            die "Please install BSD::Resource to use the 'ulimit' option in $def{config}\n"; 
        }
    }

    zmq_safe_term();

	if ($nofork || (!($gpid=fork))) {
        zmq_fork_undef() if !$nofork;

        $log_to_stderr = 1 if $nofork;
        $gpid = $$ if $nofork;
		die "Can't fork child process\n" if (! defined $gpid);

        $context=zmq_init();
		open (P, ">$conf{pid_file}") || die "Can't open pidfile '$conf{pid_file}' : $!\n";
		print P $$;
		close P;                                        # save pid

        if (!$nofork) {
            open STDIN,  '</dev/null';
            if (! -t STDOUT || ( $conf{log_file} && ! ($conf{log_file} eq '-'))) {
                open STDOUT, '>/dev/null';
                open STDERR, '>&STDOUT';
            }
        }

		POSIX::setsid();

		$SIG{INT}  = sub { $quit = 1; };
		$SIG{TERM} = $SIG{INT};
		$SIG{HUP}  = \&init;

        $router = zmq_socket($context, ZMQ_ROUTER);

        zmq_bind($router, "tcp://$conf{bind}:$conf{port}")
		  and die "Can't make ZMQ router on port $conf{port}: $!";

		$quit = 0;
		$conf{loop_num} = 0;
		while (!$quit) {
            my $did=0;
            my $start = Time::HiRes::time();
			++$conf{loop_num};
			eval {$did|=readsocks()};
			last if $quit;
			xlog("error", "Daemon $$ readsocks exception: $@") if $@;

			# theoretically these could be in separate threads, or forked off
			if ( $conf{services}->{queue} ) {
				eval {$did|=schedule()};
				xlog("error", "Daemon $$ schedule exception: $@") if $@;
			}

			if ( $conf{services}->{exec} ) {
				eval {$did|=srvexec()};
				xlog("error", "Daemon $$ srvexec exception: $@") if $@;
			};

            if (!$did) {
                my $elapsed = Time::HiRes::time()-$start;
                if ($elapsed < .25) {
                    # fractional sleep
#		            xlog("debug", "Did nothing, fractional sleep for " . (.25-$elapsed));
                    select(undef, undef, undef, .25-$elapsed);
                }
            }
#            print "HERE\n";
		}
		xlog("note", "Shutdown");
        zmq_unbind($router, "tcp://$conf{bind}:$conf{port}");
        sleep(1);
        eval {readsocks()};
		zmq_safe_term();
		unlink $conf{pid_file};
	}
	exit 0;
}

sub END {
    zmq_safe_term();
}

sub zmq_safe_term() {
    # if i'm the parent pid
    if ($router) {
        zmq_setsockopt($router, ZMQ_LINGER, 1);
        zmq_unbind($router,"tcp://$conf{bind}:$conf{port}");
        zmq_close($router);
        $router=undef;
    }
    # these are all the execution nodes... if any
    for (values(%ZMQS)) {
        zmq_setsockopt($_, ZMQ_LINGER, 1);
        zmq_close($_);
    }
    %ZMQS=();
    if ($context) {
        zmq_term($context);
        $context=undef;
    }
}

sub zmq_fork_undef() {
   # close all duped file descriptors
    opendir(D,"/proc/$$/fd");
    while(my $fd = readdir(D)) {
        if ($fd > 2 && (POSIX::lseek($fd, 0, POSIX::SEEK_CUR) == -1)) {
            if (POSIX::ESPIPE == POSIX::errno()) {
                POSIX::close($fd) if $fd > 2;
            }
        }
    }
    closedir(D);
    # kill access to any forked sockets
    $router=undef;
    %ZMQS=();
    $context=undef;
}


sub killgpid {
	die "Can't find pid $conf{pid_file} for daemon\n" if !$gpid;
	if (!kill(2, $gpid)) {
		die "Can't kill -INT $gpid: $!\n";
	}
	sleep 1;
	$gpid = 0;
}

sub samehost {
	my($h1, $h2) = @_;
    $h1 =~ s/\s+$//;
    $h2 =~ s/\s+$//;
	$h1=host2ip($h1);
	$h2=host2ip($h2);
	# localhost = dig `hostname`
	$h1 =~ s/^(0\.0\.0\.0|127\.0\.0\.1)$/$conf{hostip}/;
	$h2 =~ s/^(0\.0\.0\.0|127\.0\.0\.1)$/$conf{hostip}/;
	return $h1 eq $h2;
}

sub host2ip {
  my (@octets, $raw_addr, $ip);
  return $_[0] if $_[0] =~ /^(\d+\.){3}\d+$/;
  $raw_addr = (gethostbyname($_[0]))[4];
  @octets = unpack("C4", $raw_addr);
  $ip = join(".", @octets);
  return($ip);
}

sub packdump {
    return encode_json($_[0]);
}

sub kill_job {
	my %op = @_;
    my ($err) = waitmsg($conf{master}, $conf{master_port}, 'jkill', \%op);
    if ($err =~ /Forward (\d+):?\s*([\d.]+):?(\d*)/i) {
        my ($jid, $ip, $port) = ($1, $2, $3);
        $port = $conf{port} if !$port;
        ($err) = waitmsg($ip, $port, 'xabort', $jid, $op{sig}, $op{termio});
    }
	return $err;
}

sub proper {
	my $x = shift;
	$x=~ s/\b(\S)/uc($1)/eg;
	return $x;
}

sub execute_job {
    my ($opts) = @_;
    my @cmd = @{$opts->{cmd}};

    my ($uid, $gid, $err, $ret);
    if ($opts->{user}) {
        (undef, undef, $uid, $gid) = getpwnam($opts->{user});
    }
    if (defined($uid) && !$conf{run_asroot} && !$uid) {
        $err="Won't run as root";
    }
    if (!defined($uid)) {
        $err="User $opts->{user} is unknown on this machine, not executing";
    }
    if ($opts->{group}) {
        $gid=$opts->{group};
    }
# expecting someone to come pick up output?
    xlog("Creating io_wait for $opts->{id}") if $opts->{wait} || $opts->{io};
    $io_wait{$opts->{id}}->{type} = 'stat' if $opts->{wait};	# status wait
    $io_wait{$opts->{id}}->{type} = 'io' if $opts->{io};		# io wait
    $io_wait{$opts->{id}}->{time} = time() if $opts->{wait} || $opts->{io};		# io wait

    my $pid;

    if ($conf{wrap}) {
        @cmd = ($conf{wrap}, @cmd);
    }

    my $pfile = "$conf{spool}/jpids/$opts->{id}";

    if (-e $pfile) {
        xlog($err="Job file $pfile already exists, not executing.");
    } else {
        if (!open(PF, ">$pfile")) {
            xlog($err="Can't create $pfile: $!");
        }
    }

    my $bfile = "$conf{spool}/jstat/$opts->{id}";
    my $tfile = "$bfile.stat";
    my $ifile = "$bfile.job";

    $opts->{uid}=$uid;
    $opts->{gid}=$gid;

    # job info, saved
    burp($ifile, packfile($opts));
    chown($uid,0,$ifile);

    if (!open(XOUT, ">", "$tfile")) {
        $err=$!;
        $ret=$?;
    }
    chown($uid,0,$tfile);
    # leave the file open for fork

    if (!$err && !($pid=fork)) {
        if (! defined $pid) {
            $err = "Can't fork";
        } else {
            $0="GRUN:$opts->{id}";
            zmq_fork_undef();

        # restore signal to default ... regular kill
            $SIG{INT} = undef;
            $SIG{TERM} = undef;
            $ENV{USER} = $opts->{user};

        # kill me with a negative number, all kids die too
        # kill my parent.... i stay alive, and my IO is still ready to go
            my $pgid=POSIX::setsid();
            xlog("debug", "PGID set to $pgid\n");

        # copy in the umask & the environment
            umask $opts->{umask};
            for (keys(%{$opts->{env}})) {
                $ENV{$_}=$opts->{env}->{$_};
            }

            for (keys(%{$opts})) {
                next if ref($opts->{$_});
                $ENV{"_GRUN_OPT_$_"} = $opts->{$_};
            }
            $ENV{"_GRUN"} = $opts->{id};
            $ENV{"SGE_TASK_ID"} = $opts->{id} if (!$ENV{SGE_TASK_ID});

            my ($err, $ret);

            my $shfile = "$bfile.sh";
            if (!open(SHFILE, ">", $shfile)) {
                $err=$!;
                $ret=$?;
            }
            chown($uid,0,$shfile);

            my $hard_factor = defined($opts->{hard_factor}) ? $opts->{hard_factor} : $conf{hard_factor};
            if ($hard_factor && $opts->{memory}) {eval{
                # add 4mb for o/s stuff to load
                print SHFILE "ulimit -v " . (4000+int(($opts->{memory} * $conf{hard_factor}))) . "\n" 
                    if $opts->{memory};
            }}

            if ($cmd[0] !~ / /) {
                for (@cmd) {
                    if ($_ =~ / /) {
                        $_ =~ s/"/\\"/g;
                        $_ = '"' . $_ . '"';
                    }
                }
            }

            print SHFILE join(" ", @cmd), "\n";
            close SHFILE;

            xlog("debug", "Wrote $shfile\n") if $opts->{trace};

            if (!$err) {
                xlog("debug", "Setting uid to $uid, gid to $gid.\n") if $opts->{trace};
                eval {	
                    if ($gid) {
                        $) = $gid;
                        $( = $gid;
                    }
                    $> = $uid;
                    $< = $uid;
                    if ($opts->{cwd}) {
                        if (!chdir($opts->{cwd})) {
                            $err = "Can't cd to $opts->{cwd} : $!";
                            $ret = 103;
                        }
                    }
                };
            }

            if (!$err && $@) {
                $ret = 102;
                $err = "Error setting uid to $uid: $@\n";
            }

            xlog("debug", "About to launch (@cmd)\n") if $opts->{trace};

            if (!$err) {
                my $confarg = "-C $conf{config}";                # same config!
                # this frees all ram... yay, finally
                exec("$GRUN_PATH $confarg -Y $bfile");
                $err = "Can't exec: $!";
                $pid=-1;
            } else {
                eval {
                    # save output

                    # immediate reply if possible
                    xlog("debug", "Error before launch: $ret \"$err\" (@cmd)\n") if $opts->{trace};
                    my $out = {
                        id=>$opts->{id},
                        status=>$ret,
                        error=>$err,
                        dumped=>1,
                    };
                    print XOUT packfile($out);
                    close XOUT;
                    $context=zmq_init();
                    send_status_for_job($out);
                    printf STDERR "$err\n";	
                };
                if ($@) {
                    xlog("error", "Error reporting error : $@\n");
                }
                exit $ret;
            } 
        }
    }

# fake pid
    if ($err) {
        $ret = $STATUS_EXECERR if !$ret; 
        $pid = 'e' . $opts->{id};
        xlog("note", "Error '$err' with job $opts->{id}, pid '$pid'");
        my $out = {
            id=>$opts->{id},
            status=>$ret,
            error=>$err,
            dumped=>1,
        };
        print XOUT packfile($out);
        close XOUT;

        xlog("debug", "FILE: $tfile");

        notifystat($out, 1, 1);
        # special case... exec failed
        exit 1 if $pid == -1;
    } else {
        xlog("note", "Started job $opts->{id}, pid $pid, '@cmd'") if !$err;
        close XOUT;
        # record pid for wait
        $pid_jobs{$pid}->{jid}=$opts->{id};
        $pid_jobs{$pid}->{time}=time();
    }

    $opts->{pid} = $pid;
    $opts->{err} = $err;
    print PF packfile($opts);
    close PF;

    # ok we've officially either a) started this job or b) notified of failure at this point....
    sendcmd_nowait($conf{master}, $conf{port}, 'jexok', { map { $_ => $opts->{$_} } qw/id pid uid gid err/ });
}

sub send_status_for_job {
    my ($stat, %op) = @_;
# this could be in a fork... be safe
    my $sock = zmq_socket($context, ZMQ_DEALER);
	xlog("note", "Sending status for $stat->{id} as $stat->{status}\n") if $conf{trace};
    zmq_connect($sock,"tcp://$conf{bind}:$conf{port}");
    zmq_send($sock, "", 0, ZMQ_SNDMORE);
    zmq_send($sock, packcmd("sstat", $stat));  
    zmq_close($sock);
    zmq_term($context);
}

sub jid_from_opts {
    my ($job) =@_;
    return $job->{jid} ? $job->{jid} : jid_from_guid($job->{guid});
}

sub jid_from_guid {
    my ($guid) = @_;
    if (-s "$conf{spool}/guids/$guid") {
        return slurp("$conf{spool}/guids/$guid");
    }
}

sub create_guid {
    substr(Data::UUID->new()->create_hex(),2)
}

sub exec_clean {
    my ($jid) = @_;
    if (!-e "$conf{spool}/jpids/$jid") {
        xlog("debug", "Clean $jid which is already gone");
    } else {
        xlog("debug", "Cleaning $jid") if $conf{trace};
        # stop tracking this pid, head node got all the info
        my $job = unpack_file("$conf{spool}/jpids/$jid");
        # nobody asked for the job info, so don't keep it around after exec node knows about it
        unlink("$conf{spool}/jpids/$jid");
        unlink("$conf{spool}/jstat/$jid.stat");
        unlink("$conf{spool}/jstat/$jid.held");
        unlink("$conf{spool}/jstat/$jid.stat-err");
        unlink("$conf{spool}/jstat/$jid.sh");
        unlink("$conf{spool}/jstat/$jid.job");
        unlink("$conf{spool}/jstat/$jid.dack");
        unlink("$conf{spool}/jstat/$jid.dumped");
    }
}

sub stream_sigh {
    xlog("debug", "stream $$ SIG $_[0]");

    # set the quit flag
    $stream_quit = 1;
    close STDIN;
    close STDOUT;
    close STDERR;
    # don't exit... you still need to send the "quit" signal
#    exit(-1);
}

sub do_stream {
    # stream results back to execution top

    $SIG{INT} = \&stream_sigh;
    $SIG{TERM} = \&stream_sigh;
    $SIG{PIPE} = \&stream_sigh;

    # shift off the -X
	shift @ARGV;

    my $termio;

    # line by line?
    if ($ARGV[0] eq '-I') {
        $termio=1;
        shift @ARGV;
    }

    my $connect = shift @ARGV;
    my $key = shift @ARGV;
    my $data = shift @ARGV;
    my $sock = zmq_socket($context, ZMQ_DEALER);

    zmq_setsockopt($sock, ZMQ_LINGER, 1000);
    zmq_setsockopt($sock, ZMQ_HWM, 100);
#    zmq_setsockopt($sock, ZMQ_RCVTIMEO, 1000);

    xlog("debug", "stream $$ $connect $key");

    zmq_connect($sock,"tcp://$connect");

    xlog("debug", "stream sready $$ $connect $key");

    zmq_send($sock, "", 0, ZMQ_SNDMORE);
    zmq_send($sock, packcmd('sready', $key));

    my $ready=0;
    my $time=time();
    my $wait = 5;
    while (!$stream_quit && !$ready) {
        my $got = 0;
        zmq_poll([{
            socket=>$sock, events=>ZMQ_POLLIN, callback=> sub {
                my $ignore = zmq_recvmsg($sock);
                my $msg  = zmq_recvmsg($sock);
                my $data = zmq_msg_data($msg);
                $ready = $data=~/ready|quit/;
                $stream_quit = $data=~/quit/;
                $got = 1;
        }}],1000);
        if (!$stream_quit && !$got && (time() > ($time+$wait))) {
            # ask again ... are you ready for the stream
            zmq_send($sock, "", 0, ZMQ_SNDMORE);
            zmq_send($sock, packcmd('sready', $key));
            $time=time();
            $wait += 5;
            if ($wait > 3600) {
                xlog("debug", "stream abandon $$ $key\n");
                exit(0);
            }
        }
    }

    xlog("debug", "stream response $$ $key $data\n");

    my $sent = 1;
    while(!$stream_quit) {
        if ($sent) {
            if ($termio) {
                # line by line
                $_=<>;
            } else {
                # block by block
                read STDIN, $_, 4096;
            }
            if ($_ eq "") {
                $stream_quit = 1 ;
                last;
            }
            $sent=0; 
        }
        zmq_poll([{
            socket=>$sock, events=>ZMQ_POLLOUT, callback=> sub {
            my $ret;
            if ($ret=zmq_send($sock, "", 0, ZMQ_SNDMORE)) {
                xlog("debug", "stream error $ret $$ $key : $?/$!\n");
            } else {
                if (($ret=zmq_send($sock, packcmd('stream', $key, $_))) && $? ) {
                    xlog("debug", "stream error $ret $$ $key : $?/$!\n");
                } else {
                    # ok, that chunk of data went out 
                    $sent=1;
                }
            }
        }}]);
    }

    # let daddy know we're done

    zmq_send($sock, "", 0, ZMQ_SNDMORE);
    zmq_send($sock, packcmd("stream", "$key:end"));
    zmq_close($sock);
    zmq_term($context);

    xlog("debug", "stream exit $$ $key\n");

    exit(0);
}

sub debugging {
    return $conf{log_types}->{"debug"}
}

sub getjobstathash {
    my ($jid)=@_;
    if ( -s "$conf{spool}/jstat/$jid.stat" ) {
        my $stat = unpack_file("$conf{spool}/jstat/$jid.stat");
        if ( $stat && -e "$conf{spool}/jstat/$jid.dumped" ) {
            $stat->{dumped}=slurp("$conf{spool}/jstat/$jid.dumped");
        }
        return $stat;
    }
    if ( -s "$conf{spool}/jstat/$jid.stat-err" ) {
        my $stat = unpack_file("$conf{spool}/jstat/$jid.stat-err");
        if ( $stat && -e "$conf{spool}/jstat/$jid.dumped" ) {
            $stat->{dumped}=slurp("$conf{spool}/jstat/$jid.dumped");
        }
        return $stat;
    }
    return undef;
}

sub forkandgo {
    my ($zid, $sub, @args) = @_;

print("FORKING\n");

    if (!(my $pid=fork)) {
        my $out="";
        my $err="";
        if (! defined $pid) {
            $err = "Error: Can't fork";
            sendcmd($conf{bind}, $conf{port}, 'frep', {zid=>unpack("h*",$zid), out=>$err});
        } else {
            eval {
                zmq_fork_undef();
                $daemon=0;
                $context=zmq_init();
                $0="GRUN:fork-handler";
                # no accidental messages go out on this context
                $out=eval{&$sub(@args)};
                $err=$@;
                $out = $err if $err && !$out;
                while(length($out)>100000) {
                    sendcmd($conf{bind}, $conf{port}, 'frep', {zid=>unpack("h*",$zid), out=>substr($out,0,100000,""), more=>1});
                }
                sendcmd($conf{bind}, $conf{port}, 'frep', {zid=>unpack("h*",$zid), out=>$out});
            };
            exit(0);
        }
    }
}

sub archive_job {
    my ($jid, $job, $status, $ip, $usage) = @_;

    # double-check
    carp "Need a job ref" unless ref($job);
    carp "Need a usage ref" unless !defined($usage) || ref($usage);

    $status+=0;
    if ($usage) {
        delete $usage->{ip};
        delete $usage->{id};
        $usage->{status} = $status if defined $status;
        $job->{usage} = $usage;
    } else {
        $job->{usage}->{status} = $status if defined($status);
    }

    $job->{status} = $status;
    if ($ip) {
        $job->{host} = $nodes{$ip} && $nodes{$ip}->{hostname} ? $nodes{$ip}->{hostname} : $ip;
        $job->{hostip} = $ip;
    }

    for (keys(%{$j_wait{$jid}})) {
        replymsg($_, $job); 
    }
    delete $j_wait{$jid};

    my $jhistfile=jhistpath($jid);
    xlog("debug", "Writing history for $jid to $jhistfile");
    open(ST, ">$jhistfile");
    print ST packfile($job);
    close ST;

    xlog("debug", "Unlinking $conf{spool}/jobs/$jid:$ip.run, $conf{spool}/jobs/$jid.ip, $conf{spool}/jobs/$jid");

    if (!$ip) {
        $ip = slurp("$conf{spool}/jobs/$jid.ip");
    }
    unlink("$conf{spool}/jobs/$jid:$ip.run");
    unlink("$conf{spool}/jobs/$jid.ip");
    unlink("$conf{spool}/jobs/$jid.ok");
    unlink("$conf{spool}/jobs/$jid");
    if ($job->{guid}) {
# can't query by guid anymore, except maybe in a history database
        unlink("$conf{spool}/guids/$job->{guid}");
    }

    if ($start_wait{$jid}) {
		# info needed for status/stdio collection from execution node
		replymsg($start_wait{$jid}->{zid},jid=>$jid, status=>$status, error=>$job->{error}, hostname=>$job->{host}, ip=>$job->{host}?$job->{host}:"n/a");
		delete $start_wait{$jid};
	}
}

my $coder;
sub pretty_encode {
    $coder = JSON::XS->new->ascii->pretty->canonical->allow_blessed unless defined $coder;
    if (@_ > 1 || !ref(@_[0])) {
        $coder->encode([@_]);
    } else {
        $coder->encode(@_[0]);
    }
}

sub showmem {
    require PadWalker;
    my $o = PadWalker::peek_our(0);
    my $h = PadWalker::peek_my(1);
    for (keys(%$o)) {
        $h->{$_}=$o->{$_};
    }
    for (keys(%$h)) {
        $h->{$_}=${$h->{$_}} if ref($h->{$_}) eq 'SCALAR';
        $h->{$_}=${$h->{$_}} if ref($h->{$_}) eq 'REF';
    }
    return pretty_encode($h);
}

sub do_execute {
    # shift off the -Y
	shift @ARGV;
    my ($bfile) = @ARGV;

    my $opts = unpack_file("$bfile.job");
    my $uid = $opts->{uid};

    my $code = -1;
    my $start = Time::HiRes::time();

    my $shfile = "$bfile.sh";

    my @cmd = ("bash", $shfile);

    open(OLDERR, ">&STDERR");
    open(OLDOUT, ">&STDOUT");

    close(STDERR);
    close(STDOUT);

    my ($out_pid, $err_pid);

    $SIG{INT} =  $SIG{TERM} = sub {
        kill 2, $out_pid;
        kill 2, $err_pid;
    };

    my ($err, $ret);

    my $ok=1;	
    eval {
        if ($opts->{out}) {
            if ($opts->{out_a}) {
               $ok&&=open(STDOUT, ">>$opts->{out}");
            } else {
               $ok&&=open(STDOUT, ">$opts->{out}");
            }
            if ($opts->{out} eq $opts->{err}) {
                $ok&&=open(STDERR, ">&STDOUT");
            }
        }
        if ($opts->{err} && !(($opts->{out} eq $opts->{err}))) {
                if ($opts->{err_a}) {
                    $ok&&=open(STDERR, ">>$opts->{err}");
                } else {
                    $ok&&=open(STDERR, ">$opts->{err}");
                }
        }
        if ($opts->{io}) {
           my $confarg = "-C $conf{config}";                # same config!
           my $streamarg = "-X";                            # grun streamer
           $streamarg .= " -I" if $opts->{int};             # interactive mode
           if (!$opts->{out}) {
                my $cmd="/usr/bin/perl $GRUN_PATH $confarg $streamarg $conf{bind}:$conf{port} $opts->{id}:out";
                $out_pid=open(STDOUT, "|$cmd");
                $ok&&=$out_pid;
            }
            if (!$opts->{err}) {
                my $cmd="/usr/bin/perl $GRUN_PATH $confarg $streamarg $conf{bind}:$conf{port} $opts->{id}:err";
                $err_pid=open(STDERR, "|$cmd");
                $ok&&=$err_pid;
            }
        } else {
# save disk and time, i never want i/o
            if (!$opts->{err}) {
                $ok&&=open(STDERR, ">/dev/null");
            }
            if (!$opts->{out}) {
                $ok&&=open(STDOUT, ">/dev/null");
            }
        }
    };

    if ($@ || !$ok) {
        close(STDERR);
        close(STDOUT);
        open(STDERR, ">&OLDERR");
        open(STDOUT, ">&OLDOUT");
        if ($@) {
            $err=$@;
            $ret=109;
        } else {
            $err="Error opening i/o files: $!";
            $ret=109;
        }
        xlog("error", "$ret: $err\n");
    }	

    my $elapsed=0;

    # deal with nfs sync, if needed
    syncdirs(@{$opts->{syncdirs}}) if $opts->{syncdirs};

    if (!$err) {
        eval {
            $code = system(@cmd);
            $0="GRUN:$opts->{id}:$code";
        };
         $elapsed = Time::HiRes::time() - $start;
    } else {
        $code = $ret ? $ret : -1;
    }

    open(SAVERR, ">&STDERR");
    open(SAVOUT, ">&STDOUT");

    open(STDERR, ">&OLDERR");
    open(STDOUT, ">&OLDOUT");

    xlog("debug", "Job $opts->{id} original exit code is $code");

    $code = ($code == -1) ? $code : ($code & 127) ? $code & 127 : ($code >> 8);

    xlog("debug", "Done running job $opts->{id}, code $code as user $uid") if $opts->{trace};

    my ($utime, $stime,
     $maxrss, $ixrss, $idrss, $isrss, $minflt, $majflt, $nswap,
     $inblock, $oublock, $msgsnd, $msgrcv,
     $nsignals, $nvcsw, $nivcsw);

    eval {
        ($utime, $stime,
         $maxrss, $ixrss, $idrss, $isrss, $minflt, $majflt, $nswap,
         $inblock, $oublock, $msgsnd, $msgrcv,
         $nsignals, $nvcsw, $nivcsw) = getrusage(RUSAGE_CHILDREN);
    };

    xlog("debug", "Really done with job $opts->{id} (@cmd = $@)\n") if $opts->{trace};

    # $msgsnd, $msgrcv, $nsignals not used in Linux....

    my $out = {
        id=>$opts->{id},
        status=>$code,
        start_time=>$start,
        utime=>$utime,
        stime=>$stime,
        rtime=>$elapsed,
        pid=>$$,
        maxrss=>$maxrss,
        minflt=>$minflt,
        majflt=>$majflt,
        nswap=>$nswap,
        inblock=>$inblock,
        oublock=>$oublock,
        nvcsw=>$nvcsw,
        nivcsw=>$nivcsw
    };

    $out->{error} = $err if $code && $err;

    xlog("debug", "Writing output job $opts->{id}\n") if $opts->{trace};
    print XOUT packfile($out);
    close XOUT;
    xlog("debug", "Job $opts->{id} exit code $code\n") if $opts->{trace};
    send_status_for_job($out);
    exit $code;
# special exit code 101 means couldn't run the command
    close STDOUT; 
    close STDERR; 
    exit(0);
}

